xref: /linux/drivers/iommu/intel/iommu.c (revision 07fdad3a93756b872da7b53647715c48d0f4a2d0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) pci_is_display(pdev)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
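/*
 * Worked example (illustrative, not from the spec text above): with the
 * default gaw of 48 and VTD_PAGE_SHIFT of 12, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1, i.e. 2^36 4KiB pages (256TiB of IOVA space). On
 * 64-bit builds this always fits in an unsigned long, so DOMAIN_MAX_PFN()
 * returns it unchanged.
 */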
56 
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59 
60 #define rwbf_required(iommu)	(rwbf_quirk || cap_rwbf((iommu)->cap))
61 
62 /*
63  * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
64  * (used when the kernel is launched with TXT).
65  */
66 static int force_on = 0;
67 static int intel_iommu_tboot_noforce;
68 static int no_platform_optin;
69 
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
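/* With a 4KiB root table and 16-byte root entries this is 256, one per bus. */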
71 
72 /*
73  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
74  * if marked present.
75  */
76 static phys_addr_t root_entry_lctp(struct root_entry *re)
77 {
78 	if (!(re->lo & 1))
79 		return 0;
80 
81 	return re->lo & VTD_PAGE_MASK;
82 }
83 
84 /*
85  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
86  * if marked present.
87  */
88 static phys_addr_t root_entry_uctp(struct root_entry *re)
89 {
90 	if (!(re->hi & 1))
91 		return 0;
92 
93 	return re->hi & VTD_PAGE_MASK;
94 }
95 
96 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
97 {
98 	struct device_domain_info *info =
99 		rb_entry(node, struct device_domain_info, node);
100 	const u16 *rid_lhs = key;
101 
102 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
103 		return -1;
104 
105 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
106 		return 1;
107 
108 	return 0;
109 }
110 
111 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
112 {
113 	struct device_domain_info *info =
114 		rb_entry(lhs, struct device_domain_info, node);
115 	u16 key = PCI_DEVID(info->bus, info->devfn);
116 
117 	return device_rid_cmp_key(&key, rhs);
118 }
119 
120 /*
121  * Looks up an IOMMU-probed device using its source ID.
122  *
123  * Returns the pointer to the device if there is a match. Otherwise,
124  * returns NULL.
125  *
126  * Note that this helper doesn't guarantee that the device won't be
127  * released by the iommu subsystem after being returned. The caller
128  * should use its own synchronization mechanism to avoid the device
129  * being released while it is still in use, if that is a possibility.
130  */
131 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
132 {
133 	struct device_domain_info *info = NULL;
134 	struct rb_node *node;
135 	unsigned long flags;
136 
137 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
138 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
139 	if (node)
140 		info = rb_entry(node, struct device_domain_info, node);
141 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
142 
143 	return info ? info->dev : NULL;
144 }
145 
146 static int device_rbtree_insert(struct intel_iommu *iommu,
147 				struct device_domain_info *info)
148 {
149 	struct rb_node *curr;
150 	unsigned long flags;
151 
152 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
153 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
154 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
155 	if (WARN_ON(curr))
156 		return -EEXIST;
157 
158 	return 0;
159 }
160 
161 static void device_rbtree_remove(struct device_domain_info *info)
162 {
163 	struct intel_iommu *iommu = info->iommu;
164 	unsigned long flags;
165 
166 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
167 	rb_erase(&info->node, &iommu->device_rbtree);
168 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
169 }
170 
171 struct dmar_rmrr_unit {
172 	struct list_head list;		/* list of rmrr units	*/
173 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
174 	u64	base_address;		/* reserved base address*/
175 	u64	end_address;		/* reserved end address */
176 	struct dmar_dev_scope *devices;	/* target devices */
177 	int	devices_cnt;		/* target device count */
178 };
179 
180 struct dmar_atsr_unit {
181 	struct list_head list;		/* list of ATSR units */
182 	struct acpi_dmar_header *hdr;	/* ACPI header */
183 	struct dmar_dev_scope *devices;	/* target devices */
184 	int devices_cnt;		/* target device count */
185 	u8 include_all:1;		/* include all ports */
186 };
187 
188 struct dmar_satc_unit {
189 	struct list_head list;		/* list of SATC units */
190 	struct acpi_dmar_header *hdr;	/* ACPI header */
191 	struct dmar_dev_scope *devices;	/* target devices */
192 	struct intel_iommu *iommu;	/* the corresponding iommu */
193 	int devices_cnt;		/* target device count */
194 	u8 atc_required:1;		/* ATS is required */
195 };
196 
197 static LIST_HEAD(dmar_atsr_units);
198 static LIST_HEAD(dmar_rmrr_units);
199 static LIST_HEAD(dmar_satc_units);
200 
201 #define for_each_rmrr_units(rmrr) \
202 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
203 
204 static void intel_iommu_domain_free(struct iommu_domain *domain);
205 
206 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
207 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
208 
209 int intel_iommu_enabled = 0;
210 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
211 
212 static int intel_iommu_superpage = 1;
213 static int iommu_identity_mapping;
214 static int iommu_skip_te_disable;
215 static int disable_igfx_iommu;
216 
217 #define IDENTMAP_AZALIA		4
218 
219 const struct iommu_ops intel_iommu_ops;
220 static const struct iommu_dirty_ops intel_dirty_ops;
221 
222 static bool translation_pre_enabled(struct intel_iommu *iommu)
223 {
224 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
225 }
226 
227 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
228 {
229 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
230 }
231 
232 static void init_translation_status(struct intel_iommu *iommu)
233 {
234 	u32 gsts;
235 
236 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
237 	if (gsts & DMA_GSTS_TES)
238 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
239 }
240 
241 static int __init intel_iommu_setup(char *str)
242 {
243 	if (!str)
244 		return -EINVAL;
245 
246 	while (*str) {
247 		if (!strncmp(str, "on", 2)) {
248 			dmar_disabled = 0;
249 			pr_info("IOMMU enabled\n");
250 		} else if (!strncmp(str, "off", 3)) {
251 			dmar_disabled = 1;
252 			no_platform_optin = 1;
253 			pr_info("IOMMU disabled\n");
254 		} else if (!strncmp(str, "igfx_off", 8)) {
255 			disable_igfx_iommu = 1;
256 			pr_info("Disable GFX device mapping\n");
257 		} else if (!strncmp(str, "forcedac", 8)) {
258 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
259 			iommu_dma_forcedac = true;
260 		} else if (!strncmp(str, "strict", 6)) {
261 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
262 			iommu_set_dma_strict();
263 		} else if (!strncmp(str, "sp_off", 6)) {
264 			pr_info("Disable supported super page\n");
265 			intel_iommu_superpage = 0;
266 		} else if (!strncmp(str, "sm_on", 5)) {
267 			pr_info("Enable scalable mode if hardware supports\n");
268 			intel_iommu_sm = 1;
269 		} else if (!strncmp(str, "sm_off", 6)) {
270 			pr_info("Scalable mode is disallowed\n");
271 			intel_iommu_sm = 0;
272 		} else if (!strncmp(str, "tboot_noforce", 13)) {
273 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
274 			intel_iommu_tboot_noforce = 1;
275 		} else {
276 			pr_notice("Unknown option - '%s'\n", str);
277 		}
278 
279 		str += strcspn(str, ",");
280 		while (*str == ',')
281 			str++;
282 	}
283 
284 	return 1;
285 }
286 __setup("intel_iommu=", intel_iommu_setup);
287 
288 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
289 {
290 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
291 
292 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
293 }
294 
295 /*
296  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
297  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
298  * the returned SAGAW.
299  */
300 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
301 {
302 	unsigned long fl_sagaw, sl_sagaw;
303 
304 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
305 	sl_sagaw = cap_sagaw(iommu->cap);
306 
307 	/* Second level only. */
308 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
309 		return sl_sagaw;
310 
311 	/* First level only. */
312 	if (!ecap_slts(iommu->ecap))
313 		return fl_sagaw;
314 
315 	return fl_sagaw & sl_sagaw;
316 }
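/*
 * For illustration: in the SAGAW encoding of Section 11.4.2, BIT(2)
 * denotes a 4-level (48-bit) page table and BIT(3) a 5-level (57-bit)
 * one. An IOMMU supporting both translation types reports the
 * intersection, so a unit whose second level only does 4-level walks
 * ends up with just BIT(2) even if its first level could do 5-level.
 */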
317 
318 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
319 {
320 	unsigned long sagaw;
321 	int agaw;
322 
323 	sagaw = __iommu_calculate_sagaw(iommu);
324 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
325 		if (test_bit(agaw, &sagaw))
326 			break;
327 	}
328 
329 	return agaw;
330 }
331 
332 /*
333  * Calculate max SAGAW for each iommu.
334  */
335 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
336 {
337 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
338 }
339 
340 /*
341  * Calculate the agaw for each iommu.
342  * "SAGAW" may differ across iommus; use a default agaw, and fall back
343  * to a smaller supported agaw for iommus that don't support the default.
344  */
345 int iommu_calculate_agaw(struct intel_iommu *iommu)
346 {
347 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
348 }
349 
350 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
351 {
352 	return sm_supported(iommu) ?
353 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
354 }
355 
356 /* Return the super pagesize bitmap if supported. */
357 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
358 {
359 	unsigned long bitmap = 0;
360 
361 	/*
362 	 * 1-level super page supports page size of 2MiB, 2-level super page
363 	 * supports page size of both 2MiB and 1GiB.
364 	 */
365 	if (domain->iommu_superpage == 1)
366 		bitmap |= SZ_2M;
367 	else if (domain->iommu_superpage == 2)
368 		bitmap |= SZ_2M | SZ_1G;
369 
370 	return bitmap;
371 }
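/*
 * For example, a domain with iommu_superpage == 2 advertises SZ_2M | SZ_1G
 * here, so the IOMMU core may pass 2MiB- or 1GiB-aligned chunks down to the
 * driver's map_pages() callback.
 */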
372 
373 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
374 					 u8 devfn, int alloc)
375 {
376 	struct root_entry *root = &iommu->root_entry[bus];
377 	struct context_entry *context;
378 	u64 *entry;
379 
380 	/*
381 	 * Unless the caller has requested allocation of a new entry,
382 	 * returning a copied context entry makes no sense.
383 	 */
384 	if (!alloc && context_copied(iommu, bus, devfn))
385 		return NULL;
386 
387 	entry = &root->lo;
388 	if (sm_supported(iommu)) {
389 		if (devfn >= 0x80) {
390 			devfn -= 0x80;
391 			entry = &root->hi;
392 		}
393 		devfn *= 2;
394 	}
395 	if (*entry & 1)
396 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
397 	else {
398 		unsigned long phy_addr;
399 		if (!alloc)
400 			return NULL;
401 
402 		context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
403 						    SZ_4K);
404 		if (!context)
405 			return NULL;
406 
407 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
408 		phy_addr = virt_to_phys((void *)context);
409 		*entry = phy_addr | 1;
410 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
411 	}
412 	return &context[devfn];
413 }
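/*
 * Scalable-mode indexing example (illustrative): each root entry then
 * covers only half of the devfn space. A devfn of 0x85 selects the upper
 * context table via root->hi, and the remaining 0x05 is doubled because a
 * scalable-mode context entry occupies 32 bytes (two 16-byte slots), so
 * &context[10] is returned.
 */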
414 
415 /**
416  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
417  *				 sub-hierarchy of a candidate PCI-PCI bridge
418  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
419  * @bridge: the candidate PCI-PCI bridge
420  *
421  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
422  */
423 static bool
424 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
425 {
426 	struct pci_dev *pdev, *pbridge;
427 
428 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
429 		return false;
430 
431 	pdev = to_pci_dev(dev);
432 	pbridge = to_pci_dev(bridge);
433 
434 	if (pbridge->subordinate &&
435 	    pbridge->subordinate->number <= pdev->bus->number &&
436 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
437 		return true;
438 
439 	return false;
440 }
441 
442 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
443 {
444 	struct dmar_drhd_unit *drhd;
445 	u32 vtbar;
446 	int rc;
447 
448 	/* We know that this device on this chipset has its own IOMMU.
449 	 * If we find it under a different IOMMU, then the BIOS is lying
450 	 * to us. Hope that the IOMMU for this device is actually
451 	 * disabled, and it needs no translation...
452 	 */
453 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
454 	if (rc) {
455 		/* "can't" happen */
456 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
457 		return false;
458 	}
459 	vtbar &= 0xffff0000;
460 
461 	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
462 	drhd = dmar_find_matched_drhd_unit(pdev);
463 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
464 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
465 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
466 		return true;
467 	}
468 
469 	return false;
470 }
471 
472 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
473 {
474 	if (!iommu || iommu->drhd->ignored)
475 		return true;
476 
477 	if (dev_is_pci(dev)) {
478 		struct pci_dev *pdev = to_pci_dev(dev);
479 
480 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
481 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
482 		    quirk_ioat_snb_local_iommu(pdev))
483 			return true;
484 	}
485 
486 	return false;
487 }
488 
489 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
490 {
491 	struct dmar_drhd_unit *drhd = NULL;
492 	struct pci_dev *pdev = NULL;
493 	struct intel_iommu *iommu;
494 	struct device *tmp;
495 	u16 segment = 0;
496 	int i;
497 
498 	if (!dev)
499 		return NULL;
500 
501 	if (dev_is_pci(dev)) {
502 		struct pci_dev *pf_pdev;
503 
504 		pdev = pci_real_dma_dev(to_pci_dev(dev));
505 
506 		/* VFs aren't listed in scope tables; we need to look up
507 		 * the PF instead to find the IOMMU. */
508 		pf_pdev = pci_physfn(pdev);
509 		dev = &pf_pdev->dev;
510 		segment = pci_domain_nr(pdev->bus);
511 	} else if (has_acpi_companion(dev))
512 		dev = &ACPI_COMPANION(dev)->dev;
513 
514 	rcu_read_lock();
515 	for_each_iommu(iommu, drhd) {
516 		if (pdev && segment != drhd->segment)
517 			continue;
518 
519 		for_each_active_dev_scope(drhd->devices,
520 					  drhd->devices_cnt, i, tmp) {
521 			if (tmp == dev) {
522 				/* For a VF use its original BDF# not that of the PF
523 				 * which we used for the IOMMU lookup. Strictly speaking
524 				 * we could do this for all PCI devices; we only need to
525 				 * get the BDF# from the scope table for ACPI matches. */
526 				if (pdev && pdev->is_virtfn)
527 					goto got_pdev;
528 
529 				if (bus && devfn) {
530 					*bus = drhd->devices[i].bus;
531 					*devfn = drhd->devices[i].devfn;
532 				}
533 				goto out;
534 			}
535 
536 			if (is_downstream_to_pci_bridge(dev, tmp))
537 				goto got_pdev;
538 		}
539 
540 		if (pdev && drhd->include_all) {
541 got_pdev:
542 			if (bus && devfn) {
543 				*bus = pdev->bus->number;
544 				*devfn = pdev->devfn;
545 			}
546 			goto out;
547 		}
548 	}
549 	iommu = NULL;
550 out:
551 	if (iommu_is_dummy(iommu, dev))
552 		iommu = NULL;
553 
554 	rcu_read_unlock();
555 
556 	return iommu;
557 }
558 
559 static void domain_flush_cache(struct dmar_domain *domain,
560 			       void *addr, int size)
561 {
562 	if (!domain->iommu_coherency)
563 		clflush_cache_range(addr, size);
564 }
565 
566 static void free_context_table(struct intel_iommu *iommu)
567 {
568 	struct context_entry *context;
569 	int i;
570 
571 	if (!iommu->root_entry)
572 		return;
573 
574 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
575 		context = iommu_context_addr(iommu, i, 0, 0);
576 		if (context)
577 			iommu_free_pages(context);
578 
579 		if (!sm_supported(iommu))
580 			continue;
581 
582 		context = iommu_context_addr(iommu, i, 0x80, 0);
583 		if (context)
584 			iommu_free_pages(context);
585 	}
586 
587 	iommu_free_pages(iommu->root_entry);
588 	iommu->root_entry = NULL;
589 }
590 
591 #ifdef CONFIG_DMAR_DEBUG
592 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
593 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
594 {
595 	struct dma_pte *pte;
596 	int offset;
597 
598 	while (1) {
599 		offset = pfn_level_offset(pfn, level);
600 		pte = &parent[offset];
601 
602 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
603 
604 		if (!dma_pte_present(pte)) {
605 			pr_info("page table not present at level %d\n", level - 1);
606 			break;
607 		}
608 
609 		if (level == 1 || dma_pte_superpage(pte))
610 			break;
611 
612 		parent = phys_to_virt(dma_pte_addr(pte));
613 		level--;
614 	}
615 }
616 
617 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
618 			  unsigned long long addr, u32 pasid)
619 {
620 	struct pasid_dir_entry *dir, *pde;
621 	struct pasid_entry *entries, *pte;
622 	struct context_entry *ctx_entry;
623 	struct root_entry *rt_entry;
624 	int i, dir_index, index, level;
625 	u8 devfn = source_id & 0xff;
626 	u8 bus = source_id >> 8;
627 	struct dma_pte *pgtable;
628 
629 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
630 
631 	/* root entry dump */
632 	if (!iommu->root_entry) {
633 		pr_info("root table is not present\n");
634 		return;
635 	}
636 	rt_entry = &iommu->root_entry[bus];
637 
638 	if (sm_supported(iommu))
639 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
640 			rt_entry->hi, rt_entry->lo);
641 	else
642 		pr_info("root entry: 0x%016llx", rt_entry->lo);
643 
644 	/* context entry dump */
645 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
646 	if (!ctx_entry) {
647 		pr_info("context table is not present\n");
648 		return;
649 	}
650 
651 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
652 		ctx_entry->hi, ctx_entry->lo);
653 
654 	/* legacy mode does not require PASID entries */
655 	if (!sm_supported(iommu)) {
656 		if (!context_present(ctx_entry)) {
657 			pr_info("legacy mode page table is not present\n");
658 			return;
659 		}
660 		level = agaw_to_level(ctx_entry->hi & 7);
661 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
662 		goto pgtable_walk;
663 	}
664 
665 	if (!context_present(ctx_entry)) {
666 		pr_info("pasid directory table is not present\n");
667 		return;
668 	}
669 
670 	/* get the pointer to pasid directory entry */
671 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
672 
673 	/* For a request without PASID, use the reserved IOMMU_NO_PASID value */
674 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
675 		pasid = IOMMU_NO_PASID;
676 
677 	dir_index = pasid >> PASID_PDE_SHIFT;
678 	pde = &dir[dir_index];
679 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
680 
681 	/* get the pointer to the pasid table entry */
682 	entries = get_pasid_table_from_pde(pde);
683 	if (!entries) {
684 		pr_info("pasid table is not present\n");
685 		return;
686 	}
687 	index = pasid & PASID_PTE_MASK;
688 	pte = &entries[index];
689 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
690 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
691 
692 	if (!pasid_pte_is_present(pte)) {
693 		pr_info("scalable mode page table is not present\n");
694 		return;
695 	}
696 
697 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
698 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
699 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
700 	} else {
701 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
702 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
703 	}
704 
705 pgtable_walk:
706 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
707 }
708 #endif
709 
710 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
711 				      unsigned long pfn, int *target_level,
712 				      gfp_t gfp)
713 {
714 	struct dma_pte *parent, *pte;
715 	int level = agaw_to_level(domain->agaw);
716 	int offset;
717 
718 	if (!domain_pfn_supported(domain, pfn))
719 		/* Address beyond IOMMU's addressing capabilities. */
720 		return NULL;
721 
722 	parent = domain->pgd;
723 
724 	while (1) {
725 		void *tmp_page;
726 
727 		offset = pfn_level_offset(pfn, level);
728 		pte = &parent[offset];
729 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
730 			break;
731 		if (level == *target_level)
732 			break;
733 
734 		if (!dma_pte_present(pte)) {
735 			uint64_t pteval, tmp;
736 
737 			tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
738 							     SZ_4K);
739 
740 			if (!tmp_page)
741 				return NULL;
742 
743 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
744 			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
745 				 DMA_PTE_WRITE;
746 			if (domain->use_first_level)
747 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
748 
749 			tmp = 0ULL;
750 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
751 				/* Someone else set it while we were thinking; use theirs. */
752 				iommu_free_pages(tmp_page);
753 			else
754 				domain_flush_cache(domain, pte, sizeof(*pte));
755 		}
756 		if (level == 1)
757 			break;
758 
759 		parent = phys_to_virt(dma_pte_addr(pte));
760 		level--;
761 	}
762 
763 	if (!*target_level)
764 		*target_level = level;
765 
766 	return pte;
767 }
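/*
 * Walk example (illustrative): for a domain whose agaw covers 48 bits,
 * agaw_to_level() returns 4 and each level consumes 9 pfn bits, so
 * pfn_level_offset(pfn, 4) selects pfn bits 27..35 (IOVA bits 39..47),
 * down to pfn bits 0..8 (IOVA bits 12..20) at level 1.
 */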
768 
769 /* Return the address's PTE at a specific level. */
770 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
771 					 unsigned long pfn,
772 					 int level, int *large_page)
773 {
774 	struct dma_pte *parent, *pte;
775 	int total = agaw_to_level(domain->agaw);
776 	int offset;
777 
778 	parent = domain->pgd;
779 	while (level <= total) {
780 		offset = pfn_level_offset(pfn, total);
781 		pte = &parent[offset];
782 		if (level == total)
783 			return pte;
784 
785 		if (!dma_pte_present(pte)) {
786 			*large_page = total;
787 			break;
788 		}
789 
790 		if (dma_pte_superpage(pte)) {
791 			*large_page = total;
792 			return pte;
793 		}
794 
795 		parent = phys_to_virt(dma_pte_addr(pte));
796 		total--;
797 	}
798 	return NULL;
799 }
800 
801 /* Clear last-level PTEs; a TLB flush should follow. */
802 static void dma_pte_clear_range(struct dmar_domain *domain,
803 				unsigned long start_pfn,
804 				unsigned long last_pfn)
805 {
806 	unsigned int large_page;
807 	struct dma_pte *first_pte, *pte;
808 
809 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
810 	    WARN_ON(start_pfn > last_pfn))
811 		return;
812 
813 	/* we don't need lock here; nobody else touches the iova range */
814 	do {
815 		large_page = 1;
816 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
817 		if (!pte) {
818 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
819 			continue;
820 		}
821 		do {
822 			dma_clear_pte(pte);
823 			start_pfn += lvl_to_nr_pages(large_page);
824 			pte++;
825 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
826 
827 		domain_flush_cache(domain, first_pte,
828 				   (void *)pte - (void *)first_pte);
829 
830 	} while (start_pfn && start_pfn <= last_pfn);
831 }
832 
833 static void dma_pte_free_level(struct dmar_domain *domain, int level,
834 			       int retain_level, struct dma_pte *pte,
835 			       unsigned long pfn, unsigned long start_pfn,
836 			       unsigned long last_pfn)
837 {
838 	pfn = max(start_pfn, pfn);
839 	pte = &pte[pfn_level_offset(pfn, level)];
840 
841 	do {
842 		unsigned long level_pfn;
843 		struct dma_pte *level_pte;
844 
845 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
846 			goto next;
847 
848 		level_pfn = pfn & level_mask(level);
849 		level_pte = phys_to_virt(dma_pte_addr(pte));
850 
851 		if (level > 2) {
852 			dma_pte_free_level(domain, level - 1, retain_level,
853 					   level_pte, level_pfn, start_pfn,
854 					   last_pfn);
855 		}
856 
857 		/*
858 		 * Free the page table if we're below the level we want to
859 		 * retain and the range covers the entire table.
860 		 */
861 		if (level < retain_level && !(start_pfn > level_pfn ||
862 		      last_pfn < level_pfn + level_size(level) - 1)) {
863 			dma_clear_pte(pte);
864 			domain_flush_cache(domain, pte, sizeof(*pte));
865 			iommu_free_pages(level_pte);
866 		}
867 next:
868 		pfn += level_size(level);
869 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
870 }
871 
872 /*
873  * clear last level (leaf) ptes and free page table pages below the
874  * level we wish to keep intact.
875  */
876 static void dma_pte_free_pagetable(struct dmar_domain *domain,
877 				   unsigned long start_pfn,
878 				   unsigned long last_pfn,
879 				   int retain_level)
880 {
881 	dma_pte_clear_range(domain, start_pfn, last_pfn);
882 
883 	/* We don't need lock here; nobody else touches the iova range */
884 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
885 			   domain->pgd, 0, start_pfn, last_pfn);
886 
887 	/* free pgd */
888 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
889 		iommu_free_pages(domain->pgd);
890 		domain->pgd = NULL;
891 	}
892 }
893 
894 /* When a page at a given level is being unlinked from its parent, we don't
895    need to *modify* it at all. All we need to do is make a list of all the
896    pages which can be freed just as soon as we've flushed the IOTLB and we
897    know the hardware page-walk will no longer touch them.
898    The 'pte' argument is the *parent* PTE, pointing to the page that is to
899    be freed. */
900 static void dma_pte_list_pagetables(struct dmar_domain *domain,
901 				    int level, struct dma_pte *parent_pte,
902 				    struct iommu_pages_list *freelist)
903 {
904 	struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
905 
906 	iommu_pages_list_add(freelist, pte);
907 
908 	if (level == 1)
909 		return;
910 
911 	do {
912 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
913 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
914 		pte++;
915 	} while (!first_pte_in_page(pte));
916 }
917 
918 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
919 				struct dma_pte *pte, unsigned long pfn,
920 				unsigned long start_pfn, unsigned long last_pfn,
921 				struct iommu_pages_list *freelist)
922 {
923 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
924 
925 	pfn = max(start_pfn, pfn);
926 	pte = &pte[pfn_level_offset(pfn, level)];
927 
928 	do {
929 		unsigned long level_pfn = pfn & level_mask(level);
930 
931 		if (!dma_pte_present(pte))
932 			goto next;
933 
934 		/* If range covers entire pagetable, free it */
935 		if (start_pfn <= level_pfn &&
936 		    last_pfn >= level_pfn + level_size(level) - 1) {
937 			/* These subordinate page tables are going away entirely. Don't
938 			   bother to clear them; we're just going to *free* them. */
939 			if (level > 1 && !dma_pte_superpage(pte))
940 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
941 
942 			dma_clear_pte(pte);
943 			if (!first_pte)
944 				first_pte = pte;
945 			last_pte = pte;
946 		} else if (level > 1) {
947 			/* Recurse down into a level that isn't *entirely* obsolete */
948 			dma_pte_clear_level(domain, level - 1,
949 					    phys_to_virt(dma_pte_addr(pte)),
950 					    level_pfn, start_pfn, last_pfn,
951 					    freelist);
952 		}
953 next:
954 		pfn = level_pfn + level_size(level);
955 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
956 
957 	if (first_pte)
958 		domain_flush_cache(domain, first_pte,
959 				   (void *)++last_pte - (void *)first_pte);
960 }
961 
962 /* We can't just free the pages because the IOMMU may still be walking
963    the page tables, and may have cached the intermediate levels. The
964    pages can only be freed after the IOTLB flush has been done. */
965 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
966 			 unsigned long last_pfn,
967 			 struct iommu_pages_list *freelist)
968 {
969 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
970 	    WARN_ON(start_pfn > last_pfn))
971 		return;
972 
973 	/* we don't need lock here; nobody else touches the iova range */
974 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
975 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
976 
977 	/* free pgd */
978 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
979 		iommu_pages_list_add(freelist, domain->pgd);
980 		domain->pgd = NULL;
981 	}
982 }
983 
984 /* iommu handling */
985 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
986 {
987 	struct root_entry *root;
988 
989 	root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
990 	if (!root) {
991 		pr_err("Allocating root entry for %s failed\n",
992 			iommu->name);
993 		return -ENOMEM;
994 	}
995 
996 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
997 	iommu->root_entry = root;
998 
999 	return 0;
1000 }
1001 
1002 static void iommu_set_root_entry(struct intel_iommu *iommu)
1003 {
1004 	u64 addr;
1005 	u32 sts;
1006 	unsigned long flag;
1007 
1008 	addr = virt_to_phys(iommu->root_entry);
1009 	if (sm_supported(iommu))
1010 		addr |= DMA_RTADDR_SMT;
1011 
1012 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1013 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1014 
1015 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1016 
1017 	/* Make sure hardware complete it */
1018 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1019 		      readl, (sts & DMA_GSTS_RTPS), sts);
1020 
1021 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1022 
1023 	/*
1024 	 * Hardware invalidates all DMA remapping hardware translation
1025 	 * caches as part of SRTP flow.
1026 	 */
1027 	if (cap_esrtps(iommu->cap))
1028 		return;
1029 
1030 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1031 	if (sm_supported(iommu))
1032 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1033 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1034 }
1035 
1036 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1037 {
1038 	u32 val;
1039 	unsigned long flag;
1040 
1041 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1042 		return;
1043 
1044 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1045 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1046 
1047 	/* Make sure hardware complete it */
1048 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1049 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1050 
1051 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1052 }
1053 
1054 /* Context-cache invalidation using the register-based interface. */
1055 static void __iommu_flush_context(struct intel_iommu *iommu,
1056 				  u16 did, u16 source_id, u8 function_mask,
1057 				  u64 type)
1058 {
1059 	u64 val = 0;
1060 	unsigned long flag;
1061 
1062 	switch (type) {
1063 	case DMA_CCMD_GLOBAL_INVL:
1064 		val = DMA_CCMD_GLOBAL_INVL;
1065 		break;
1066 	case DMA_CCMD_DOMAIN_INVL:
1067 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1068 		break;
1069 	case DMA_CCMD_DEVICE_INVL:
1070 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1071 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1072 		break;
1073 	default:
1074 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1075 			iommu->name, type);
1076 		return;
1077 	}
1078 	val |= DMA_CCMD_ICC;
1079 
1080 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1081 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1082 
1083 	/* Make sure hardware complete it */
1084 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1085 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1086 
1087 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1088 }
1089 
1090 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1091 			 unsigned int size_order, u64 type)
1092 {
1093 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1094 	u64 val = 0, val_iva = 0;
1095 	unsigned long flag;
1096 
1097 	switch (type) {
1098 	case DMA_TLB_GLOBAL_FLUSH:
1099 		/* a global flush doesn't need to set IVA_REG */
1100 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1101 		break;
1102 	case DMA_TLB_DSI_FLUSH:
1103 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 		break;
1105 	case DMA_TLB_PSI_FLUSH:
1106 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1107 		/* IH bit is passed in as part of address */
1108 		val_iva = size_order | addr;
1109 		break;
1110 	default:
1111 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1112 			iommu->name, type);
1113 		return;
1114 	}
1115 
1116 	if (cap_write_drain(iommu->cap))
1117 		val |= DMA_TLB_WRITE_DRAIN;
1118 
1119 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1120 	/* Note: Only uses first TLB reg currently */
1121 	if (val_iva)
1122 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1123 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1124 
1125 	/* Make sure hardware complete it */
1126 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1127 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1128 
1129 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1130 
1131 	/* check IOTLB invalidation granularity */
1132 	if (DMA_TLB_IAIG(val) == 0)
1133 		pr_err("Flush IOTLB failed\n");
1134 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1135 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1136 			(unsigned long long)DMA_TLB_IIRG(type),
1137 			(unsigned long long)DMA_TLB_IAIG(val));
1138 }
1139 
1140 static struct device_domain_info *
1141 domain_lookup_dev_info(struct dmar_domain *domain,
1142 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1143 {
1144 	struct device_domain_info *info;
1145 	unsigned long flags;
1146 
1147 	spin_lock_irqsave(&domain->lock, flags);
1148 	list_for_each_entry(info, &domain->devices, link) {
1149 		if (info->iommu == iommu && info->bus == bus &&
1150 		    info->devfn == devfn) {
1151 			spin_unlock_irqrestore(&domain->lock, flags);
1152 			return info;
1153 		}
1154 	}
1155 	spin_unlock_irqrestore(&domain->lock, flags);
1156 
1157 	return NULL;
1158 }
1159 
1160 /*
1161  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1162  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1163  * check because it applies only to the built-in QAT devices and it doesn't
1164  * grant additional privileges.
1165  */
1166 #define BUGGY_QAT_DEVID_MASK 0x4940
1167 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1168 {
1169 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1170 		return false;
1171 
1172 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1173 		return false;
1174 
1175 	return true;
1176 }
1177 
1178 static void iommu_enable_pci_ats(struct device_domain_info *info)
1179 {
1180 	struct pci_dev *pdev;
1181 
1182 	if (!info->ats_supported)
1183 		return;
1184 
1185 	pdev = to_pci_dev(info->dev);
1186 	if (!pci_ats_page_aligned(pdev))
1187 		return;
1188 
1189 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1190 		info->ats_enabled = 1;
1191 }
1192 
1193 static void iommu_disable_pci_ats(struct device_domain_info *info)
1194 {
1195 	if (!info->ats_enabled)
1196 		return;
1197 
1198 	pci_disable_ats(to_pci_dev(info->dev));
1199 	info->ats_enabled = 0;
1200 }
1201 
1202 static void iommu_enable_pci_pri(struct device_domain_info *info)
1203 {
1204 	struct pci_dev *pdev;
1205 
1206 	if (!info->ats_enabled || !info->pri_supported)
1207 		return;
1208 
1209 	pdev = to_pci_dev(info->dev);
1210 	/* PASID is required in PRG Response Message. */
1211 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1212 		return;
1213 
1214 	if (pci_reset_pri(pdev))
1215 		return;
1216 
1217 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
1218 		info->pri_enabled = 1;
1219 }
1220 
1221 static void iommu_disable_pci_pri(struct device_domain_info *info)
1222 {
1223 	if (!info->pri_enabled)
1224 		return;
1225 
1226 	if (WARN_ON(info->iopf_refcount))
1227 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1228 
1229 	pci_disable_pri(to_pci_dev(info->dev));
1230 	info->pri_enabled = 0;
1231 }
1232 
1233 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1234 {
1235 	cache_tag_flush_all(to_dmar_domain(domain));
1236 }
1237 
1238 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1239 {
1240 	u32 pmen;
1241 	unsigned long flags;
1242 
1243 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1244 		return;
1245 
1246 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1247 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1248 	pmen &= ~DMA_PMEN_EPM;
1249 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1250 
1251 	/* wait for the protected region status bit to clear */
1252 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1253 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1254 
1255 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1256 }
1257 
1258 static void iommu_enable_translation(struct intel_iommu *iommu)
1259 {
1260 	u32 sts;
1261 	unsigned long flags;
1262 
1263 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1264 	iommu->gcmd |= DMA_GCMD_TE;
1265 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1266 
1267 	/* Make sure hardware complete it */
1268 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1269 		      readl, (sts & DMA_GSTS_TES), sts);
1270 
1271 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1272 }
1273 
1274 static void iommu_disable_translation(struct intel_iommu *iommu)
1275 {
1276 	u32 sts;
1277 	unsigned long flag;
1278 
1279 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1280 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1281 		return;
1282 
1283 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1284 	iommu->gcmd &= ~DMA_GCMD_TE;
1285 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1286 
1287 	/* Make sure hardware complete it */
1288 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1289 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1290 
1291 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293 
1294 static void disable_dmar_iommu(struct intel_iommu *iommu)
1295 {
1296 	/*
1297 	 * All iommu domains must have been detached from the devices,
1298 	 * hence there should be no domain IDs in use.
1299 	 */
1300 	if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
1301 		return;
1302 
1303 	if (iommu->gcmd & DMA_GCMD_TE)
1304 		iommu_disable_translation(iommu);
1305 }
1306 
1307 static void free_dmar_iommu(struct intel_iommu *iommu)
1308 {
1309 	if (iommu->copied_tables) {
1310 		bitmap_free(iommu->copied_tables);
1311 		iommu->copied_tables = NULL;
1312 	}
1313 
1314 	/* free context mapping */
1315 	free_context_table(iommu);
1316 
1317 	if (ecap_prs(iommu->ecap))
1318 		intel_iommu_finish_prq(iommu);
1319 }
1320 
1321 /*
1322  * Check and return whether first level is used by default for
1323  * DMA translation.
1324  */
1325 static bool first_level_by_default(struct intel_iommu *iommu)
1326 {
1327 	/* Only SL is available in legacy mode */
1328 	if (!sm_supported(iommu))
1329 		return false;
1330 
1331 	/* Only one level (either FL or SL) is available, just use it */
1332 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1333 		return ecap_flts(iommu->ecap);
1334 
1335 	return true;
1336 }
1337 
1338 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1339 {
1340 	struct iommu_domain_info *info, *curr;
1341 	int num, ret = -ENOSPC;
1342 
1343 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1344 		return 0;
1345 
1346 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1347 	if (!info)
1348 		return -ENOMEM;
1349 
1350 	guard(mutex)(&iommu->did_lock);
1351 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1352 	if (curr) {
1353 		curr->refcnt++;
1354 		kfree(info);
1355 		return 0;
1356 	}
1357 
1358 	num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1359 			      cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1360 	if (num < 0) {
1361 		pr_err("%s: No free domain ids\n", iommu->name);
1362 		goto err_unlock;
1363 	}
1364 
1365 	info->refcnt	= 1;
1366 	info->did	= num;
1367 	info->iommu	= iommu;
1368 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1369 			  NULL, info, GFP_KERNEL);
1370 	if (curr) {
1371 		ret = xa_err(curr) ? : -EBUSY;
1372 		goto err_clear;
1373 	}
1374 
1375 	return 0;
1376 
1377 err_clear:
1378 	ida_free(&iommu->domain_ida, info->did);
1379 err_unlock:
1380 	kfree(info);
1381 	return ret;
1382 }
1383 
1384 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1385 {
1386 	struct iommu_domain_info *info;
1387 
1388 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1389 		return;
1390 
1391 	guard(mutex)(&iommu->did_lock);
1392 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1393 	if (--info->refcnt == 0) {
1394 		ida_free(&iommu->domain_ida, info->did);
1395 		xa_erase(&domain->iommu_array, iommu->seq_id);
1396 		kfree(info);
1397 	}
1398 }
1399 
1400 /*
1401  * For kdump cases, old valid entries may be cached due to the
1402  * in-flight DMA and copied pgtable, but there is no unmapping
1403  * behaviour for them, thus we need an explicit cache flush for
1404  * the newly-mapped device. For kdump, at this point, the device
1405  * is supposed to have finished reset at its driver probe stage, so no
1406  * in-flight DMA will exist, and we don't need to worry about it
1407  * hereafter.
1408  */
1409 static void copied_context_tear_down(struct intel_iommu *iommu,
1410 				     struct context_entry *context,
1411 				     u8 bus, u8 devfn)
1412 {
1413 	u16 did_old;
1414 
1415 	if (!context_copied(iommu, bus, devfn))
1416 		return;
1417 
1418 	assert_spin_locked(&iommu->lock);
1419 
1420 	did_old = context_domain_id(context);
1421 	context_clear_entry(context);
1422 
1423 	if (did_old < cap_ndoms(iommu->cap)) {
1424 		iommu->flush.flush_context(iommu, did_old,
1425 					   PCI_DEVID(bus, devfn),
1426 					   DMA_CCMD_MASK_NOBIT,
1427 					   DMA_CCMD_DEVICE_INVL);
1428 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1429 					 DMA_TLB_DSI_FLUSH);
1430 	}
1431 
1432 	clear_context_copied(iommu, bus, devfn);
1433 }
1434 
1435 /*
1436  * It's a non-present to present mapping. If hardware doesn't cache
1437  * non-present entries we only need to flush the write-buffer. If it
1438  * _does_ cache non-present entries, then it does so in the special
1439  * domain #0, which we have to flush:
1440  */
1441 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1442 					u8 bus, u8 devfn)
1443 {
1444 	if (cap_caching_mode(iommu->cap)) {
1445 		iommu->flush.flush_context(iommu, 0,
1446 					   PCI_DEVID(bus, devfn),
1447 					   DMA_CCMD_MASK_NOBIT,
1448 					   DMA_CCMD_DEVICE_INVL);
1449 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1450 	} else {
1451 		iommu_flush_write_buffer(iommu);
1452 	}
1453 }
1454 
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1456 				      struct intel_iommu *iommu,
1457 				      u8 bus, u8 devfn)
1458 {
1459 	struct device_domain_info *info =
1460 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1461 	u16 did = domain_id_iommu(domain, iommu);
1462 	int translation = CONTEXT_TT_MULTI_LEVEL;
1463 	struct dma_pte *pgd = domain->pgd;
1464 	struct context_entry *context;
1465 	int ret;
1466 
1467 	if (WARN_ON(!intel_domain_is_ss_paging(domain)))
1468 		return -EINVAL;
1469 
1470 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1471 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1472 
1473 	spin_lock(&iommu->lock);
1474 	ret = -ENOMEM;
1475 	context = iommu_context_addr(iommu, bus, devfn, 1);
1476 	if (!context)
1477 		goto out_unlock;
1478 
1479 	ret = 0;
1480 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1481 		goto out_unlock;
1482 
1483 	copied_context_tear_down(iommu, context, bus, devfn);
1484 	context_clear_entry(context);
1485 	context_set_domain_id(context, did);
1486 
1487 	if (info && info->ats_supported)
1488 		translation = CONTEXT_TT_DEV_IOTLB;
1489 	else
1490 		translation = CONTEXT_TT_MULTI_LEVEL;
1491 
1492 	context_set_address_root(context, virt_to_phys(pgd));
1493 	context_set_address_width(context, domain->agaw);
1494 	context_set_translation_type(context, translation);
1495 	context_set_fault_enable(context);
1496 	context_set_present(context);
1497 	if (!ecap_coherent(iommu->ecap))
1498 		clflush_cache_range(context, sizeof(*context));
1499 	context_present_cache_flush(iommu, did, bus, devfn);
1500 	ret = 0;
1501 
1502 out_unlock:
1503 	spin_unlock(&iommu->lock);
1504 
1505 	return ret;
1506 }
1507 
1508 static int domain_context_mapping_cb(struct pci_dev *pdev,
1509 				     u16 alias, void *opaque)
1510 {
1511 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1512 	struct intel_iommu *iommu = info->iommu;
1513 	struct dmar_domain *domain = opaque;
1514 
1515 	return domain_context_mapping_one(domain, iommu,
1516 					  PCI_BUS_NUM(alias), alias & 0xff);
1517 }
1518 
1519 static int
1520 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1521 {
1522 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1523 	struct intel_iommu *iommu = info->iommu;
1524 	u8 bus = info->bus, devfn = info->devfn;
1525 	int ret;
1526 
1527 	if (!dev_is_pci(dev))
1528 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1529 
1530 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1531 				     domain_context_mapping_cb, domain);
1532 	if (ret)
1533 		return ret;
1534 
1535 	iommu_enable_pci_ats(info);
1536 
1537 	return 0;
1538 }
1539 
1540 /* Return largest possible superpage level for a given mapping */
1541 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1542 				   unsigned long phy_pfn, unsigned long pages)
1543 {
1544 	int support, level = 1;
1545 	unsigned long pfnmerge;
1546 
1547 	support = domain->iommu_superpage;
1548 
1549 	/* To use a large page, the virtual *and* physical addresses
1550 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1551 	   of them will mean we have to use smaller pages. So just
1552 	   merge them and check both at once. */
1553 	pfnmerge = iov_pfn | phy_pfn;
1554 
1555 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1556 		pages >>= VTD_STRIDE_SHIFT;
1557 		if (!pages)
1558 			break;
1559 		pfnmerge >>= VTD_STRIDE_SHIFT;
1560 		level++;
1561 		support--;
1562 	}
1563 	return level;
1564 }
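/*
 * Example (illustrative): with domain->iommu_superpage == 2, an iov_pfn
 * and phy_pfn that are both 512-page (2MiB) aligned and a request of at
 * least 512 pages yield level 2; if both are also 1GiB aligned and the
 * request spans at least 262144 pages, level 3 (1GiB pages) is returned.
 */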
1565 
1566 /*
1567  * Ensure that old small page tables are removed to make room for superpage(s).
1568  * We're going to add new large pages, so make sure we don't remove their parent
1569  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1570  */
1571 static void switch_to_super_page(struct dmar_domain *domain,
1572 				 unsigned long start_pfn,
1573 				 unsigned long end_pfn, int level)
1574 {
1575 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1576 	struct dma_pte *pte = NULL;
1577 
1578 	if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
1579 		    !IS_ALIGNED(end_pfn + 1, lvl_pages)))
1580 		return;
1581 
1582 	while (start_pfn <= end_pfn) {
1583 		if (!pte)
1584 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1585 					     GFP_ATOMIC);
1586 
1587 		if (dma_pte_present(pte)) {
1588 			dma_pte_free_pagetable(domain, start_pfn,
1589 					       start_pfn + lvl_pages - 1,
1590 					       level + 1);
1591 
1592 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1593 					      end_pfn << VTD_PAGE_SHIFT, 0);
1594 		}
1595 
1596 		pte++;
1597 		start_pfn += lvl_pages;
1598 		if (first_pte_in_page(pte))
1599 			pte = NULL;
1600 	}
1601 }
1602 
1603 static int
1604 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1605 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1606 		 gfp_t gfp)
1607 {
1608 	struct dma_pte *first_pte = NULL, *pte = NULL;
1609 	unsigned int largepage_lvl = 0;
1610 	unsigned long lvl_pages = 0;
1611 	phys_addr_t pteval;
1612 	u64 attr;
1613 
1614 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1615 		return -EINVAL;
1616 
1617 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1618 		return -EINVAL;
1619 
1620 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1621 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1622 		return -EINVAL;
1623 	}
1624 
1625 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1626 	if (domain->use_first_level) {
1627 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1628 		if (prot & DMA_PTE_WRITE)
1629 			attr |= DMA_FL_PTE_DIRTY;
1630 	}
1631 
1632 	domain->has_mappings = true;
1633 
1634 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1635 
1636 	while (nr_pages > 0) {
1637 		uint64_t tmp;
1638 
1639 		if (!pte) {
1640 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1641 					phys_pfn, nr_pages);
1642 
1643 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1644 					     gfp);
1645 			if (!pte)
1646 				return -ENOMEM;
1647 			first_pte = pte;
1648 
1649 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1650 
1651 			/* It is a large page */
1652 			if (largepage_lvl > 1) {
1653 				unsigned long end_pfn;
1654 				unsigned long pages_to_remove;
1655 
1656 				pteval |= DMA_PTE_LARGE_PAGE;
1657 				pages_to_remove = min_t(unsigned long,
1658 							round_down(nr_pages, lvl_pages),
1659 							nr_pte_to_next_page(pte) * lvl_pages);
1660 				end_pfn = iov_pfn + pages_to_remove - 1;
1661 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1662 			} else {
1663 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1664 			}
1665 
1666 		}
1667 		/* We don't need lock here, nobody else
1668 		 * touches the iova range
1669 		 */
1670 		tmp = 0ULL;
1671 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1672 			static int dumps = 5;
1673 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1674 				iov_pfn, tmp, (unsigned long long)pteval);
1675 			if (dumps) {
1676 				dumps--;
1677 				debug_dma_dump_mappings(NULL);
1678 			}
1679 			WARN_ON(1);
1680 		}
1681 
1682 		nr_pages -= lvl_pages;
1683 		iov_pfn += lvl_pages;
1684 		phys_pfn += lvl_pages;
1685 		pteval += lvl_pages * VTD_PAGE_SIZE;
1686 
1687 		/* If the next PTE would be the first in a new page, then we
1688 		 * need to flush the cache on the entries we've just written.
1689 		 * And then we'll need to recalculate 'pte', so clear it and
1690 		 * let it get set again in the if (!pte) block above.
1691 		 *
1692 		 * If we're done (!nr_pages) we need to flush the cache too.
1693 		 *
1694 		 * Also if we've been setting superpages, we may need to
1695 		 * recalculate 'pte' and switch back to smaller pages for the
1696 		 * end of the mapping, if the trailing size is not enough to
1697 		 * use another superpage (i.e. nr_pages < lvl_pages).
1698 		 */
1699 		pte++;
1700 		if (!nr_pages || first_pte_in_page(pte) ||
1701 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1702 			domain_flush_cache(domain, first_pte,
1703 					   (void *)pte - (void *)first_pte);
1704 			pte = NULL;
1705 		}
1706 	}
1707 
1708 	return 0;
1709 }
1710 
1711 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1712 {
1713 	struct intel_iommu *iommu = info->iommu;
1714 	struct context_entry *context;
1715 	u16 did;
1716 
1717 	spin_lock(&iommu->lock);
1718 	context = iommu_context_addr(iommu, bus, devfn, 0);
1719 	if (!context) {
1720 		spin_unlock(&iommu->lock);
1721 		return;
1722 	}
1723 
1724 	did = context_domain_id(context);
1725 	context_clear_entry(context);
1726 	__iommu_flush_cache(iommu, context, sizeof(*context));
1727 	spin_unlock(&iommu->lock);
1728 	intel_context_flush_no_pasid(info, context, did);
1729 }
1730 
1731 int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
1732 			       ioasid_t pasid, u16 did, phys_addr_t fsptptr,
1733 			       int flags, struct iommu_domain *old)
1734 {
1735 	if (!old)
1736 		return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
1737 						     did, flags);
1738 	return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
1739 					       iommu_domain_did(old, iommu),
1740 					       flags);
1741 }
1742 
1743 static int domain_setup_second_level(struct intel_iommu *iommu,
1744 				     struct dmar_domain *domain,
1745 				     struct device *dev, ioasid_t pasid,
1746 				     struct iommu_domain *old)
1747 {
1748 	if (!old)
1749 		return intel_pasid_setup_second_level(iommu, domain,
1750 						      dev, pasid);
1751 	return intel_pasid_replace_second_level(iommu, domain, dev,
1752 						iommu_domain_did(old, iommu),
1753 						pasid);
1754 }
1755 
1756 static int domain_setup_passthrough(struct intel_iommu *iommu,
1757 				    struct device *dev, ioasid_t pasid,
1758 				    struct iommu_domain *old)
1759 {
1760 	if (!old)
1761 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1762 	return intel_pasid_replace_pass_through(iommu, dev,
1763 						iommu_domain_did(old, iommu),
1764 						pasid);
1765 }
1766 
1767 static int domain_setup_first_level(struct intel_iommu *iommu,
1768 				    struct dmar_domain *domain,
1769 				    struct device *dev,
1770 				    u32 pasid, struct iommu_domain *old)
1771 {
1772 	struct dma_pte *pgd = domain->pgd;
1773 	int level, flags = 0;
1774 
1775 	level = agaw_to_level(domain->agaw);
1776 	if (level != 4 && level != 5)
1777 		return -EINVAL;
1778 
1779 	if (level == 5)
1780 		flags |= PASID_FLAG_FL5LP;
1781 
1782 	if (domain->force_snooping)
1783 		flags |= PASID_FLAG_PAGE_SNOOP;
1784 
1785 	return __domain_setup_first_level(iommu, dev, pasid,
1786 					  domain_id_iommu(domain, iommu),
1787 					  __pa(pgd), flags, old);
1788 }
1789 
1790 static int dmar_domain_attach_device(struct dmar_domain *domain,
1791 				     struct device *dev)
1792 {
1793 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1794 	struct intel_iommu *iommu = info->iommu;
1795 	unsigned long flags;
1796 	int ret;
1797 
1798 	ret = domain_attach_iommu(domain, iommu);
1799 	if (ret)
1800 		return ret;
1801 
1802 	info->domain = domain;
1803 	info->domain_attached = true;
1804 	spin_lock_irqsave(&domain->lock, flags);
1805 	list_add(&info->link, &domain->devices);
1806 	spin_unlock_irqrestore(&domain->lock, flags);
1807 
1808 	if (dev_is_real_dma_subdevice(dev))
1809 		return 0;
1810 
1811 	if (!sm_supported(iommu))
1812 		ret = domain_context_mapping(domain, dev);
1813 	else if (intel_domain_is_fs_paging(domain))
1814 		ret = domain_setup_first_level(iommu, domain, dev,
1815 					       IOMMU_NO_PASID, NULL);
1816 	else if (intel_domain_is_ss_paging(domain))
1817 		ret = domain_setup_second_level(iommu, domain, dev,
1818 						IOMMU_NO_PASID, NULL);
1819 	else if (WARN_ON(true))
1820 		ret = -EINVAL;
1821 
1822 	if (ret)
1823 		goto out_block_translation;
1824 
1825 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1826 	if (ret)
1827 		goto out_block_translation;
1828 
1829 	return 0;
1830 
1831 out_block_translation:
1832 	device_block_translation(dev);
1833 	return ret;
1834 }
1835 
1836 /**
1837  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1838  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1839  * @dev: device handle
1840  *
1841  * We assume that PCI USB devices with RMRRs have them largely
1842  * for historical reasons and that the RMRR space is not actively used post
1843  * boot.  This exclusion may change if vendors begin to abuse it.
1844  *
1845  * The same exception is made for graphics devices, with the requirement that
1846  * any use of the RMRR regions will be torn down before assigning the device
1847  * to a guest.
1848  *
1849  * Return: true if the RMRR is relaxable, false otherwise
1850  */
1851 static bool device_rmrr_is_relaxable(struct device *dev)
1852 {
1853 	struct pci_dev *pdev;
1854 
1855 	if (!dev_is_pci(dev))
1856 		return false;
1857 
1858 	pdev = to_pci_dev(dev);
1859 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1860 		return true;
1861 	else
1862 		return false;
1863 }
1864 
1865 static int device_def_domain_type(struct device *dev)
1866 {
1867 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1868 	struct intel_iommu *iommu = info->iommu;
1869 
1870 	/*
1871 	 * Hardware does not support the passthrough translation mode.
1872 	 * Always use a dynamic mapping domain.
1873 	 */
1874 	if (!ecap_pass_through(iommu->ecap))
1875 		return IOMMU_DOMAIN_DMA;
1876 
1877 	if (dev_is_pci(dev)) {
1878 		struct pci_dev *pdev = to_pci_dev(dev);
1879 
1880 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1881 			return IOMMU_DOMAIN_IDENTITY;
1882 	}
1883 
1884 	return 0;
1885 }
1886 
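/*
 * Initialize the invalidation interface for @iommu: prefer queued
 * invalidation and fall back to register-based invalidation if QI
 * cannot be enabled.
 */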
1887 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1888 {
1889 	/*
1890 	 * Start from a sane iommu hardware state.
1891 	 * If queued invalidation has already been initialized by us
1892 	 * (for example, while enabling interrupt-remapping), things
1893 	 * are already rolling from a sane state.
1894 	 */
1895 	if (!iommu->qi) {
1896 		/*
1897 		 * Clear any previous faults.
1898 		 */
1899 		dmar_fault(-1, iommu);
1900 		/*
1901 		 * Disable queued invalidation if supported and already enabled
1902 		 * before OS handover.
1903 		 */
1904 		dmar_disable_qi(iommu);
1905 	}
1906 
1907 	if (dmar_enable_qi(iommu)) {
1908 		/*
1909 		 * Queued Invalidate not enabled, use Register Based Invalidate
1910 		 */
1911 		iommu->flush.flush_context = __iommu_flush_context;
1912 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1913 		pr_info("%s: Using Register based invalidation\n",
1914 			iommu->name);
1915 	} else {
1916 		iommu->flush.flush_context = qi_flush_context;
1917 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1918 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1919 	}
1920 }
1921 
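/*
 * Copy the context table(s) referenced by the old root entry for @bus into
 * newly allocated tables. With the extended (scalable-mode) root format each
 * bus has a lower context table (devfn < 0x80) and an upper one, so two
 * table slots per bus are filled in @tbl. Domain IDs found in the copied
 * entries are reserved in the iommu's domain_ida.
 */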
1922 static int copy_context_table(struct intel_iommu *iommu,
1923 			      struct root_entry *old_re,
1924 			      struct context_entry **tbl,
1925 			      int bus, bool ext)
1926 {
1927 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1928 	struct context_entry *new_ce = NULL, ce;
1929 	struct context_entry *old_ce = NULL;
1930 	struct root_entry re;
1931 	phys_addr_t old_ce_phys;
1932 
1933 	tbl_idx = ext ? bus * 2 : bus;
1934 	memcpy(&re, old_re, sizeof(re));
1935 
1936 	for (devfn = 0; devfn < 256; devfn++) {
1937 		/* First calculate the correct index */
1938 		idx = (ext ? devfn * 2 : devfn) % 256;
1939 
1940 		if (idx == 0) {
1941 			/* First save what we may have and clean up */
1942 			if (new_ce) {
1943 				tbl[tbl_idx] = new_ce;
1944 				__iommu_flush_cache(iommu, new_ce,
1945 						    VTD_PAGE_SIZE);
1946 				pos = 1;
1947 			}
1948 
1949 			if (old_ce)
1950 				memunmap(old_ce);
1951 
1952 			ret = 0;
1953 			if (devfn < 0x80)
1954 				old_ce_phys = root_entry_lctp(&re);
1955 			else
1956 				old_ce_phys = root_entry_uctp(&re);
1957 
1958 			if (!old_ce_phys) {
1959 				if (ext && devfn == 0) {
1960 					/* No LCTP, try UCTP */
1961 					devfn = 0x7f;
1962 					continue;
1963 				} else {
1964 					goto out;
1965 				}
1966 			}
1967 
1968 			ret = -ENOMEM;
1969 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1970 					MEMREMAP_WB);
1971 			if (!old_ce)
1972 				goto out;
1973 
1974 			new_ce = iommu_alloc_pages_node_sz(iommu->node,
1975 							   GFP_KERNEL, SZ_4K);
1976 			if (!new_ce)
1977 				goto out_unmap;
1978 
1979 			ret = 0;
1980 		}
1981 
1982 		/* Now copy the context entry */
1983 		memcpy(&ce, old_ce + idx, sizeof(ce));
1984 
1985 		if (!context_present(&ce))
1986 			continue;
1987 
1988 		did = context_domain_id(&ce);
1989 		if (did >= 0 && did < cap_ndoms(iommu->cap))
1990 			ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
1991 
1992 		set_context_copied(iommu, bus, devfn);
1993 		new_ce[idx] = ce;
1994 	}
1995 
1996 	tbl[tbl_idx + pos] = new_ce;
1997 
1998 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
1999 
2000 out_unmap:
2001 	memunmap(old_ce);
2002 
2003 out:
2004 	return ret;
2005 }
2006 
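/*
 * Copy the root and context tables left over from the previous kernel (e.g.
 * when booting into a kdump kernel with translation pre-enabled) so that DMA
 * from active devices keeps working. Bails out if the old and new root table
 * formats (legacy vs. scalable mode) differ.
 */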
2007 static int copy_translation_tables(struct intel_iommu *iommu)
2008 {
2009 	struct context_entry **ctxt_tbls;
2010 	struct root_entry *old_rt;
2011 	phys_addr_t old_rt_phys;
2012 	int ctxt_table_entries;
2013 	u64 rtaddr_reg;
2014 	int bus, ret;
2015 	bool new_ext, ext;
2016 
2017 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2018 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2019 	new_ext    = !!sm_supported(iommu);
2020 
2021 	/*
2022 	 * The RTT bit can only be changed when translation is disabled,
2023 	 * but disabling translation means to open a window for data
2024 	 * corruption. So bail out and don't copy anything if we would
2025 	 * have to change the bit.
2026 	 */
2027 	if (new_ext != ext)
2028 		return -EINVAL;
2029 
2030 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2031 	if (!iommu->copied_tables)
2032 		return -ENOMEM;
2033 
2034 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2035 	if (!old_rt_phys)
2036 		return -EINVAL;
2037 
2038 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2039 	if (!old_rt)
2040 		return -ENOMEM;
2041 
2042 	/* This is too big for the stack - allocate it from slab */
2043 	ctxt_table_entries = ext ? 512 : 256;
2044 	ret = -ENOMEM;
2045 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2046 	if (!ctxt_tbls)
2047 		goto out_unmap;
2048 
2049 	for (bus = 0; bus < 256; bus++) {
2050 		ret = copy_context_table(iommu, &old_rt[bus],
2051 					 ctxt_tbls, bus, ext);
2052 		if (ret) {
2053 			pr_err("%s: Failed to copy context table for bus %d\n",
2054 				iommu->name, bus);
2055 			continue;
2056 		}
2057 	}
2058 
2059 	spin_lock(&iommu->lock);
2060 
2061 	/* Context tables are copied, now write them to the root_entry table */
2062 	for (bus = 0; bus < 256; bus++) {
2063 		int idx = ext ? bus * 2 : bus;
2064 		u64 val;
2065 
2066 		if (ctxt_tbls[idx]) {
2067 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2068 			iommu->root_entry[bus].lo = val;
2069 		}
2070 
2071 		if (!ext || !ctxt_tbls[idx + 1])
2072 			continue;
2073 
2074 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2075 		iommu->root_entry[bus].hi = val;
2076 	}
2077 
2078 	spin_unlock(&iommu->lock);
2079 
2080 	kfree(ctxt_tbls);
2081 
2082 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2083 
2084 	ret = 0;
2085 
2086 out_unmap:
2087 	memunmap(old_rt);
2088 
2089 	return ret;
2090 }
2091 
2092 static int __init init_dmars(void)
2093 {
2094 	struct dmar_drhd_unit *drhd;
2095 	struct intel_iommu *iommu;
2096 	int ret;
2097 
2098 	for_each_iommu(iommu, drhd) {
2099 		if (drhd->ignored) {
2100 			iommu_disable_translation(iommu);
2101 			continue;
2102 		}
2103 
2104 		/*
2105 		 * Find the smallest maximum PASID size supported by any
2106 		 * IOMMU in the system. The system-wide pasid table must
2107 		 * be no bigger than the smallest one supported.
2108 		 */
2109 		if (pasid_supported(iommu)) {
2110 			u32 temp = 2 << ecap_pss(iommu->ecap);
2111 
2112 			intel_pasid_max_id = min_t(u32, temp,
2113 						   intel_pasid_max_id);
2114 		}
2115 
2116 		intel_iommu_init_qi(iommu);
2117 		init_translation_status(iommu);
2118 
2119 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2120 			iommu_disable_translation(iommu);
2121 			clear_translation_pre_enabled(iommu);
2122 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2123 				iommu->name);
2124 		}
2125 
2126 		/*
2127 		 * TBD:
2128 		 * we could share the same root & context tables
2129 		 * among all IOMMUs. Need to split this out later.
2130 		 */
2131 		ret = iommu_alloc_root_entry(iommu);
2132 		if (ret)
2133 			goto free_iommu;
2134 
2135 		if (translation_pre_enabled(iommu)) {
2136 			pr_info("Translation already enabled - trying to copy translation structures\n");
2137 
2138 			ret = copy_translation_tables(iommu);
2139 			if (ret) {
2140 				/*
2141 				 * We found the IOMMU with translation
2142 				 * enabled - but failed to copy over the
2143 				 * old root-entry table. Try to proceed
2144 				 * by disabling translation now and
2145 				 * allocating a clean root-entry table.
2146 				 * This might cause DMAR faults, but
2147 				 * probably the dump will still succeed.
2148 				 */
2149 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2150 				       iommu->name);
2151 				iommu_disable_translation(iommu);
2152 				clear_translation_pre_enabled(iommu);
2153 			} else {
2154 				pr_info("Copied translation tables from previous kernel for %s\n",
2155 					iommu->name);
2156 			}
2157 		}
2158 
2159 		intel_svm_check(iommu);
2160 	}
2161 
2162 	/*
2163 	 * Now that qi is enabled on all iommus, set the root entry and flush
2164 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2165 	 * flush_context function will loop forever and the boot hangs.
2166 	 */
2167 	for_each_active_iommu(iommu, drhd) {
2168 		iommu_flush_write_buffer(iommu);
2169 		iommu_set_root_entry(iommu);
2170 	}
2171 
2172 	check_tylersburg_isoch();
2173 
2174 	/*
2175 	 * for each drhd
2176 	 *   enable fault log
2177 	 *   global invalidate context cache
2178 	 *   global invalidate iotlb
2179 	 *   enable translation
2180 	 */
2181 	for_each_iommu(iommu, drhd) {
2182 		if (drhd->ignored) {
2183 			/*
2184 			 * we always have to disable PMRs or DMA may fail on
2185 			 * this device
2186 			 */
2187 			if (force_on)
2188 				iommu_disable_protect_mem_regions(iommu);
2189 			continue;
2190 		}
2191 
2192 		iommu_flush_write_buffer(iommu);
2193 
2194 		if (ecap_prs(iommu->ecap)) {
2195 			/*
2196 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2197 			 * could cause a lock race, so drop the lock around this call.
2198 			 */
2199 			up_write(&dmar_global_lock);
2200 			ret = intel_iommu_enable_prq(iommu);
2201 			down_write(&dmar_global_lock);
2202 			if (ret)
2203 				goto free_iommu;
2204 		}
2205 
2206 		ret = dmar_set_interrupt(iommu);
2207 		if (ret)
2208 			goto free_iommu;
2209 	}
2210 
2211 	return 0;
2212 
2213 free_iommu:
2214 	for_each_active_iommu(iommu, drhd) {
2215 		disable_dmar_iommu(iommu);
2216 		free_dmar_iommu(iommu);
2217 	}
2218 
2219 	return ret;
2220 }
2221 
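/*
 * Mark DMAR units that have no devices in their scope as ignored, and flag
 * units that cover only graphics devices; the latter are also ignored when
 * the integrated graphics IOMMU is disabled.
 */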
2222 static void __init init_no_remapping_devices(void)
2223 {
2224 	struct dmar_drhd_unit *drhd;
2225 	struct device *dev;
2226 	int i;
2227 
2228 	for_each_drhd_unit(drhd) {
2229 		if (!drhd->include_all) {
2230 			for_each_active_dev_scope(drhd->devices,
2231 						  drhd->devices_cnt, i, dev)
2232 				break;
2233 			/* ignore DMAR unit if no devices exist */
2234 			if (i == drhd->devices_cnt)
2235 				drhd->ignored = 1;
2236 		}
2237 	}
2238 
2239 	for_each_active_drhd_unit(drhd) {
2240 		if (drhd->include_all)
2241 			continue;
2242 
2243 		for_each_active_dev_scope(drhd->devices,
2244 					  drhd->devices_cnt, i, dev)
2245 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2246 				break;
2247 		if (i < drhd->devices_cnt)
2248 			continue;
2249 
2250 		/* This IOMMU has *only* gfx devices. Either bypass it or
2251 		   mark it as dedicated to graphics, as appropriate. */
2252 		drhd->gfx_dedicated = 1;
2253 		if (disable_igfx_iommu)
2254 			drhd->ignored = 1;
2255 	}
2256 }
2257 
2258 #ifdef CONFIG_SUSPEND
2259 static int init_iommu_hw(void)
2260 {
2261 	struct dmar_drhd_unit *drhd;
2262 	struct intel_iommu *iommu = NULL;
2263 	int ret;
2264 
2265 	for_each_active_iommu(iommu, drhd) {
2266 		if (iommu->qi) {
2267 			ret = dmar_reenable_qi(iommu);
2268 			if (ret)
2269 				return ret;
2270 		}
2271 	}
2272 
2273 	for_each_iommu(iommu, drhd) {
2274 		if (drhd->ignored) {
2275 			/*
2276 			 * we always have to disable PMRs or DMA may fail on
2277 			 * this device
2278 			 */
2279 			if (force_on)
2280 				iommu_disable_protect_mem_regions(iommu);
2281 			continue;
2282 		}
2283 
2284 		iommu_flush_write_buffer(iommu);
2285 		iommu_set_root_entry(iommu);
2286 		iommu_enable_translation(iommu);
2287 		iommu_disable_protect_mem_regions(iommu);
2288 	}
2289 
2290 	return 0;
2291 }
2292 
2293 static void iommu_flush_all(void)
2294 {
2295 	struct dmar_drhd_unit *drhd;
2296 	struct intel_iommu *iommu;
2297 
2298 	for_each_active_iommu(iommu, drhd) {
2299 		iommu->flush.flush_context(iommu, 0, 0, 0,
2300 					   DMA_CCMD_GLOBAL_INVL);
2301 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2302 					 DMA_TLB_GLOBAL_FLUSH);
2303 	}
2304 }
2305 
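/*
 * Save the fault-event registers of each active IOMMU and disable
 * translation before the system enters suspend; iommu_resume() restores
 * the registers after init_iommu_hw() re-enables translation.
 */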
2306 static int iommu_suspend(void)
2307 {
2308 	struct dmar_drhd_unit *drhd;
2309 	struct intel_iommu *iommu = NULL;
2310 	unsigned long flag;
2311 
2312 	iommu_flush_all();
2313 
2314 	for_each_active_iommu(iommu, drhd) {
2315 		iommu_disable_translation(iommu);
2316 
2317 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2318 
2319 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2320 			readl(iommu->reg + DMAR_FECTL_REG);
2321 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2322 			readl(iommu->reg + DMAR_FEDATA_REG);
2323 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2324 			readl(iommu->reg + DMAR_FEADDR_REG);
2325 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2326 			readl(iommu->reg + DMAR_FEUADDR_REG);
2327 
2328 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2329 	}
2330 	return 0;
2331 }
2332 
2333 static void iommu_resume(void)
2334 {
2335 	struct dmar_drhd_unit *drhd;
2336 	struct intel_iommu *iommu = NULL;
2337 	unsigned long flag;
2338 
2339 	if (init_iommu_hw()) {
2340 		if (force_on)
2341 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2342 		else
2343 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2344 		return;
2345 	}
2346 
2347 	for_each_active_iommu(iommu, drhd) {
2348 
2349 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350 
2351 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2352 			iommu->reg + DMAR_FECTL_REG);
2353 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2354 			iommu->reg + DMAR_FEDATA_REG);
2355 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2356 			iommu->reg + DMAR_FEADDR_REG);
2357 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2358 			iommu->reg + DMAR_FEUADDR_REG);
2359 
2360 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361 	}
2362 }
2363 
2364 static struct syscore_ops iommu_syscore_ops = {
2365 	.resume		= iommu_resume,
2366 	.suspend	= iommu_suspend,
2367 };
2368 
2369 static void __init init_iommu_pm_ops(void)
2370 {
2371 	register_syscore_ops(&iommu_syscore_ops);
2372 }
2373 
2374 #else
2375 static inline void init_iommu_pm_ops(void) {}
2376 #endif	/* CONFIG_SUSPEND */
2377 
2378 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2379 {
2380 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2381 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2382 	    rmrr->end_address <= rmrr->base_address ||
2383 	    arch_rmrr_sanity_check(rmrr))
2384 		return -EINVAL;
2385 
2386 	return 0;
2387 }
2388 
2389 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2390 {
2391 	struct acpi_dmar_reserved_memory *rmrr;
2392 	struct dmar_rmrr_unit *rmrru;
2393 
2394 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2395 	if (rmrr_sanity_check(rmrr)) {
2396 		pr_warn(FW_BUG
2397 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2398 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2399 			   rmrr->base_address, rmrr->end_address,
2400 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2401 			   dmi_get_system_info(DMI_BIOS_VERSION),
2402 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2403 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2404 	}
2405 
2406 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2407 	if (!rmrru)
2408 		goto out;
2409 
2410 	rmrru->hdr = header;
2411 
2412 	rmrru->base_address = rmrr->base_address;
2413 	rmrru->end_address = rmrr->end_address;
2414 
2415 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2416 				((void *)rmrr) + rmrr->header.length,
2417 				&rmrru->devices_cnt);
2418 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2419 		goto free_rmrru;
2420 
2421 	list_add(&rmrru->list, &dmar_rmrr_units);
2422 
2423 	return 0;
2424 free_rmrru:
2425 	kfree(rmrru);
2426 out:
2427 	return -ENOMEM;
2428 }
2429 
2430 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2431 {
2432 	struct dmar_atsr_unit *atsru;
2433 	struct acpi_dmar_atsr *tmp;
2434 
2435 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2436 				dmar_rcu_check()) {
2437 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2438 		if (atsr->segment != tmp->segment)
2439 			continue;
2440 		if (atsr->header.length != tmp->header.length)
2441 			continue;
2442 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2443 			return atsru;
2444 	}
2445 
2446 	return NULL;
2447 }
2448 
2449 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2450 {
2451 	struct acpi_dmar_atsr *atsr;
2452 	struct dmar_atsr_unit *atsru;
2453 
2454 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2455 		return 0;
2456 
2457 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2458 	atsru = dmar_find_atsr(atsr);
2459 	if (atsru)
2460 		return 0;
2461 
2462 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2463 	if (!atsru)
2464 		return -ENOMEM;
2465 
2466 	/*
2467 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2468 	 * copy the memory content because the memory buffer will be freed
2469 	 * on return.
2470 	 */
2471 	atsru->hdr = (void *)(atsru + 1);
2472 	memcpy(atsru->hdr, hdr, hdr->length);
2473 	atsru->include_all = atsr->flags & 0x1;
2474 	if (!atsru->include_all) {
2475 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2476 				(void *)atsr + atsr->header.length,
2477 				&atsru->devices_cnt);
2478 		if (atsru->devices_cnt && atsru->devices == NULL) {
2479 			kfree(atsru);
2480 			return -ENOMEM;
2481 		}
2482 	}
2483 
2484 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2485 
2486 	return 0;
2487 }
2488 
2489 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2490 {
2491 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2492 	kfree(atsru);
2493 }
2494 
2495 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2496 {
2497 	struct acpi_dmar_atsr *atsr;
2498 	struct dmar_atsr_unit *atsru;
2499 
2500 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2501 	atsru = dmar_find_atsr(atsr);
2502 	if (atsru) {
2503 		list_del_rcu(&atsru->list);
2504 		synchronize_rcu();
2505 		intel_iommu_free_atsr(atsru);
2506 	}
2507 
2508 	return 0;
2509 }
2510 
2511 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2512 {
2513 	int i;
2514 	struct device *dev;
2515 	struct acpi_dmar_atsr *atsr;
2516 	struct dmar_atsr_unit *atsru;
2517 
2518 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2519 	atsru = dmar_find_atsr(atsr);
2520 	if (!atsru)
2521 		return 0;
2522 
2523 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2524 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2525 					  i, dev)
2526 			return -EBUSY;
2527 	}
2528 
2529 	return 0;
2530 }
2531 
2532 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2533 {
2534 	struct dmar_satc_unit *satcu;
2535 	struct acpi_dmar_satc *tmp;
2536 
2537 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2538 				dmar_rcu_check()) {
2539 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2540 		if (satc->segment != tmp->segment)
2541 			continue;
2542 		if (satc->header.length != tmp->header.length)
2543 			continue;
2544 		if (memcmp(satc, tmp, satc->header.length) == 0)
2545 			return satcu;
2546 	}
2547 
2548 	return NULL;
2549 }
2550 
2551 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2552 {
2553 	struct acpi_dmar_satc *satc;
2554 	struct dmar_satc_unit *satcu;
2555 
2556 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2557 		return 0;
2558 
2559 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2560 	satcu = dmar_find_satc(satc);
2561 	if (satcu)
2562 		return 0;
2563 
2564 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2565 	if (!satcu)
2566 		return -ENOMEM;
2567 
2568 	satcu->hdr = (void *)(satcu + 1);
2569 	memcpy(satcu->hdr, hdr, hdr->length);
2570 	satcu->atc_required = satc->flags & 0x1;
2571 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2572 					      (void *)satc + satc->header.length,
2573 					      &satcu->devices_cnt);
2574 	if (satcu->devices_cnt && !satcu->devices) {
2575 		kfree(satcu);
2576 		return -ENOMEM;
2577 	}
2578 	list_add_rcu(&satcu->list, &dmar_satc_units);
2579 
2580 	return 0;
2581 }
2582 
2583 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2584 {
2585 	struct intel_iommu *iommu = dmaru->iommu;
2586 	int ret;
2587 
2588 	/*
2589 	 * Disable translation if already enabled prior to OS handover.
2590 	 */
2591 	if (iommu->gcmd & DMA_GCMD_TE)
2592 		iommu_disable_translation(iommu);
2593 
2594 	ret = iommu_alloc_root_entry(iommu);
2595 	if (ret)
2596 		goto out;
2597 
2598 	intel_svm_check(iommu);
2599 
2600 	if (dmaru->ignored) {
2601 		/*
2602 		 * we always have to disable PMRs or DMA may fail on this device
2603 		 */
2604 		if (force_on)
2605 			iommu_disable_protect_mem_regions(iommu);
2606 		return 0;
2607 	}
2608 
2609 	intel_iommu_init_qi(iommu);
2610 	iommu_flush_write_buffer(iommu);
2611 
2612 	if (ecap_prs(iommu->ecap)) {
2613 		ret = intel_iommu_enable_prq(iommu);
2614 		if (ret)
2615 			goto disable_iommu;
2616 	}
2617 
2618 	ret = dmar_set_interrupt(iommu);
2619 	if (ret)
2620 		goto disable_iommu;
2621 
2622 	iommu_set_root_entry(iommu);
2623 	iommu_enable_translation(iommu);
2624 
2625 	iommu_disable_protect_mem_regions(iommu);
2626 	return 0;
2627 
2628 disable_iommu:
2629 	disable_dmar_iommu(iommu);
2630 out:
2631 	free_dmar_iommu(iommu);
2632 	return ret;
2633 }
2634 
2635 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2636 {
2637 	int ret = 0;
2638 	struct intel_iommu *iommu = dmaru->iommu;
2639 
2640 	if (!intel_iommu_enabled)
2641 		return 0;
2642 	if (iommu == NULL)
2643 		return -EINVAL;
2644 
2645 	if (insert) {
2646 		ret = intel_iommu_add(dmaru);
2647 	} else {
2648 		disable_dmar_iommu(iommu);
2649 		free_dmar_iommu(iommu);
2650 	}
2651 
2652 	return ret;
2653 }
2654 
2655 static void intel_iommu_free_dmars(void)
2656 {
2657 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2658 	struct dmar_atsr_unit *atsru, *atsr_n;
2659 	struct dmar_satc_unit *satcu, *satc_n;
2660 
2661 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2662 		list_del(&rmrru->list);
2663 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2664 		kfree(rmrru);
2665 	}
2666 
2667 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2668 		list_del(&atsru->list);
2669 		intel_iommu_free_atsr(atsru);
2670 	}
2671 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2672 		list_del(&satcu->list);
2673 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2674 		kfree(satcu);
2675 	}
2676 }
2677 
2678 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2679 {
2680 	struct dmar_satc_unit *satcu;
2681 	struct acpi_dmar_satc *satc;
2682 	struct device *tmp;
2683 	int i;
2684 
2685 	rcu_read_lock();
2686 
2687 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2688 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2689 		if (satc->segment != pci_domain_nr(dev->bus))
2690 			continue;
2691 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2692 			if (to_pci_dev(tmp) == dev)
2693 				goto out;
2694 	}
2695 	satcu = NULL;
2696 out:
2697 	rcu_read_unlock();
2698 	return satcu;
2699 }
2700 
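/*
 * Check whether ATS may be enabled for @dev. A matching SATC entry decides
 * directly; otherwise walk up to the PCIe root port and look it up in the
 * ATSR units for the device's segment.
 */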
2701 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2702 {
2703 	struct pci_dev *bridge = NULL;
2704 	struct dmar_atsr_unit *atsru;
2705 	struct dmar_satc_unit *satcu;
2706 	struct acpi_dmar_atsr *atsr;
2707 	bool supported = true;
2708 	struct pci_bus *bus;
2709 	struct device *tmp;
2710 	int i;
2711 
2712 	dev = pci_physfn(dev);
2713 	satcu = dmar_find_matched_satc_unit(dev);
2714 	if (satcu)
2715 		/*
2716 		 * This device supports ATS as it is in the SATC table.
2717 		 * When the IOMMU is in legacy mode, the HW automatically
2718 		 * enables ATS for devices that require it, so the OS
2719 		 * should not enable ATS on this device as well, to avoid
2720 		 * duplicated TLB invalidations.
2721 		 */
2722 		return !(satcu->atc_required && !sm_supported(iommu));
2723 
2724 	for (bus = dev->bus; bus; bus = bus->parent) {
2725 		bridge = bus->self;
2726 		/* If it's an integrated device, allow ATS */
2727 		if (!bridge)
2728 			return true;
2729 		/* Connected via non-PCIe: no ATS */
2730 		if (!pci_is_pcie(bridge) ||
2731 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2732 			return false;
2733 		/* If we found the root port, look it up in the ATSR */
2734 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2735 			break;
2736 	}
2737 
2738 	rcu_read_lock();
2739 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2740 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2741 		if (atsr->segment != pci_domain_nr(dev->bus))
2742 			continue;
2743 
2744 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2745 			if (tmp == &bridge->dev)
2746 				goto out;
2747 
2748 		if (atsru->include_all)
2749 			goto out;
2750 	}
2751 	supported = false;
2752 out:
2753 	rcu_read_unlock();
2754 
2755 	return supported;
2756 }
2757 
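/*
 * Keep the device scopes of the RMRR, ATSR and SATC units up to date as PCI
 * devices are hot-added or removed.
 */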
2758 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2759 {
2760 	int ret;
2761 	struct dmar_rmrr_unit *rmrru;
2762 	struct dmar_atsr_unit *atsru;
2763 	struct dmar_satc_unit *satcu;
2764 	struct acpi_dmar_atsr *atsr;
2765 	struct acpi_dmar_reserved_memory *rmrr;
2766 	struct acpi_dmar_satc *satc;
2767 
2768 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2769 		return 0;
2770 
2771 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2772 		rmrr = container_of(rmrru->hdr,
2773 				    struct acpi_dmar_reserved_memory, header);
2774 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2775 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2776 				((void *)rmrr) + rmrr->header.length,
2777 				rmrr->segment, rmrru->devices,
2778 				rmrru->devices_cnt);
2779 			if (ret < 0)
2780 				return ret;
2781 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2782 			dmar_remove_dev_scope(info, rmrr->segment,
2783 				rmrru->devices, rmrru->devices_cnt);
2784 		}
2785 	}
2786 
2787 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2788 		if (atsru->include_all)
2789 			continue;
2790 
2791 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2792 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2793 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2794 					(void *)atsr + atsr->header.length,
2795 					atsr->segment, atsru->devices,
2796 					atsru->devices_cnt);
2797 			if (ret > 0)
2798 				break;
2799 			else if (ret < 0)
2800 				return ret;
2801 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2802 			if (dmar_remove_dev_scope(info, atsr->segment,
2803 					atsru->devices, atsru->devices_cnt))
2804 				break;
2805 		}
2806 	}
2807 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2808 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2809 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2810 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2811 					(void *)satc + satc->header.length,
2812 					satc->segment, satcu->devices,
2813 					satcu->devices_cnt);
2814 			if (ret > 0)
2815 				break;
2816 			else if (ret < 0)
2817 				return ret;
2818 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2819 			if (dmar_remove_dev_scope(info, satc->segment,
2820 					satcu->devices, satcu->devices_cnt))
2821 				break;
2822 		}
2823 	}
2824 
2825 	return 0;
2826 }
2827 
2828 static void intel_disable_iommus(void)
2829 {
2830 	struct intel_iommu *iommu = NULL;
2831 	struct dmar_drhd_unit *drhd;
2832 
2833 	for_each_iommu(iommu, drhd)
2834 		iommu_disable_translation(iommu);
2835 }
2836 
2837 void intel_iommu_shutdown(void)
2838 {
2839 	struct dmar_drhd_unit *drhd;
2840 	struct intel_iommu *iommu = NULL;
2841 
2842 	if (no_iommu || dmar_disabled)
2843 		return;
2844 
2845 	/*
2846 	 * All other CPUs were brought down, hotplug interrupts were disabled,
2847 	 * so no locking or RCU checking is needed anymore.
2848 	 */
2849 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2850 		iommu = drhd->iommu;
2851 
2852 		/* Disable PMRs explicitly here. */
2853 		iommu_disable_protect_mem_regions(iommu);
2854 
2855 		/* Make sure the IOMMUs are switched off */
2856 		iommu_disable_translation(iommu);
2857 	}
2858 }
2859 
2860 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2861 {
2862 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2863 
2864 	return container_of(iommu_dev, struct intel_iommu, iommu);
2865 }
2866 
2867 static ssize_t version_show(struct device *dev,
2868 			    struct device_attribute *attr, char *buf)
2869 {
2870 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2871 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2872 	return sysfs_emit(buf, "%d:%d\n",
2873 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2874 }
2875 static DEVICE_ATTR_RO(version);
2876 
2877 static ssize_t address_show(struct device *dev,
2878 			    struct device_attribute *attr, char *buf)
2879 {
2880 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2881 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2882 }
2883 static DEVICE_ATTR_RO(address);
2884 
2885 static ssize_t cap_show(struct device *dev,
2886 			struct device_attribute *attr, char *buf)
2887 {
2888 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2889 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2890 }
2891 static DEVICE_ATTR_RO(cap);
2892 
2893 static ssize_t ecap_show(struct device *dev,
2894 			 struct device_attribute *attr, char *buf)
2895 {
2896 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2897 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2898 }
2899 static DEVICE_ATTR_RO(ecap);
2900 
2901 static ssize_t domains_supported_show(struct device *dev,
2902 				      struct device_attribute *attr, char *buf)
2903 {
2904 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2905 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2906 }
2907 static DEVICE_ATTR_RO(domains_supported);
2908 
2909 static ssize_t domains_used_show(struct device *dev,
2910 				 struct device_attribute *attr, char *buf)
2911 {
2912 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2913 	unsigned int count = 0;
2914 	int id;
2915 
2916 	for (id = 0; id < cap_ndoms(iommu->cap); id++)
2917 		if (ida_exists(&iommu->domain_ida, id))
2918 			count++;
2919 
2920 	return sysfs_emit(buf, "%d\n", count);
2921 }
2922 static DEVICE_ATTR_RO(domains_used);
2923 
2924 static struct attribute *intel_iommu_attrs[] = {
2925 	&dev_attr_version.attr,
2926 	&dev_attr_address.attr,
2927 	&dev_attr_cap.attr,
2928 	&dev_attr_ecap.attr,
2929 	&dev_attr_domains_supported.attr,
2930 	&dev_attr_domains_used.attr,
2931 	NULL,
2932 };
2933 
2934 static struct attribute_group intel_iommu_group = {
2935 	.name = "intel-iommu",
2936 	.attrs = intel_iommu_attrs,
2937 };
2938 
2939 const struct attribute_group *intel_iommu_groups[] = {
2940 	&intel_iommu_group,
2941 	NULL,
2942 };
2943 
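/* Return true if any PCI device is marked as external facing by the platform. */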
2944 static bool has_external_pci(void)
2945 {
2946 	struct pci_dev *pdev = NULL;
2947 
2948 	for_each_pci_dev(pdev)
2949 		if (pdev->external_facing) {
2950 			pci_dev_put(pdev);
2951 			return true;
2952 		}
2953 
2954 	return false;
2955 }
2956 
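/*
 * Force-enable the IOMMU when the platform has opted in via the DMAR table
 * and an external-facing PCI device is present. Returns 1 if the IOMMU was
 * force enabled, 0 otherwise.
 */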
2957 static int __init platform_optin_force_iommu(void)
2958 {
2959 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2960 		return 0;
2961 
2962 	if (no_iommu || dmar_disabled)
2963 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2964 
2965 	/*
2966 	 * If Intel-IOMMU is disabled by default, we will apply identity
2967 	 * map for all devices except those marked as being untrusted.
2968 	 */
2969 	if (dmar_disabled)
2970 		iommu_set_default_passthrough(false);
2971 
2972 	dmar_disabled = 0;
2973 	no_iommu = 0;
2974 
2975 	return 1;
2976 }
2977 
2978 static int __init probe_acpi_namespace_devices(void)
2979 {
2980 	struct dmar_drhd_unit *drhd;
2981 	/* To avoid a -Wunused-but-set-variable warning. */
2982 	struct intel_iommu *iommu __maybe_unused;
2983 	struct device *dev;
2984 	int i, ret = 0;
2985 
2986 	for_each_active_iommu(iommu, drhd) {
2987 		for_each_active_dev_scope(drhd->devices,
2988 					  drhd->devices_cnt, i, dev) {
2989 			struct acpi_device_physical_node *pn;
2990 			struct acpi_device *adev;
2991 
2992 			if (dev->bus != &acpi_bus_type)
2993 				continue;
2994 
2995 			up_read(&dmar_global_lock);
2996 			adev = to_acpi_device(dev);
2997 			mutex_lock(&adev->physical_node_lock);
2998 			list_for_each_entry(pn,
2999 					    &adev->physical_node_list, node) {
3000 				ret = iommu_probe_device(pn->dev);
3001 				if (ret)
3002 					break;
3003 			}
3004 			mutex_unlock(&adev->physical_node_lock);
3005 			down_read(&dmar_global_lock);
3006 
3007 			if (ret)
3008 				return ret;
3009 		}
3010 	}
3011 
3012 	return 0;
3013 }
3014 
3015 static __init int tboot_force_iommu(void)
3016 {
3017 	if (!tboot_enabled())
3018 		return 0;
3019 
3020 	if (no_iommu || dmar_disabled)
3021 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3022 
3023 	dmar_disabled = 0;
3024 	no_iommu = 0;
3025 
3026 	return 1;
3027 }
3028 
3029 int __init intel_iommu_init(void)
3030 {
3031 	int ret = -ENODEV;
3032 	struct dmar_drhd_unit *drhd;
3033 	struct intel_iommu *iommu;
3034 
3035 	/*
3036 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3037 	 * opt in, so enforce that.
3038 	 */
3039 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3040 		    platform_optin_force_iommu();
3041 
3042 	down_write(&dmar_global_lock);
3043 	if (dmar_table_init()) {
3044 		if (force_on)
3045 			panic("tboot: Failed to initialize DMAR table\n");
3046 		goto out_free_dmar;
3047 	}
3048 
3049 	if (dmar_dev_scope_init() < 0) {
3050 		if (force_on)
3051 			panic("tboot: Failed to initialize DMAR device scope\n");
3052 		goto out_free_dmar;
3053 	}
3054 
3055 	up_write(&dmar_global_lock);
3056 
3057 	/*
3058 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3059 	 * complain later when we register it under the lock.
3060 	 */
3061 	dmar_register_bus_notifier();
3062 
3063 	down_write(&dmar_global_lock);
3064 
3065 	if (!no_iommu)
3066 		intel_iommu_debugfs_init();
3067 
3068 	if (no_iommu || dmar_disabled) {
3069 		/*
3070 		 * We exit the function here to ensure the IOMMU's remapping and
3071 		 * mempool aren't set up, which means that the IOMMU's PMRs
3072 		 * won't be disabled via the call to init_dmars(). So disable
3073 		 * it explicitly here. The PMRs were setup by tboot prior to
3074 		 * calling SENTER, but the kernel is expected to reset/tear
3075 		 * down the PMRs.
3076 		 */
3077 		if (intel_iommu_tboot_noforce) {
3078 			for_each_iommu(iommu, drhd)
3079 				iommu_disable_protect_mem_regions(iommu);
3080 		}
3081 
3082 		/*
3083 		 * Make sure the IOMMUs are switched off, even when we
3084 		 * boot into a kexec kernel and the previous kernel left
3085 		 * them enabled
3086 		 */
3087 		intel_disable_iommus();
3088 		goto out_free_dmar;
3089 	}
3090 
3091 	if (list_empty(&dmar_rmrr_units))
3092 		pr_info("No RMRR found\n");
3093 
3094 	if (list_empty(&dmar_atsr_units))
3095 		pr_info("No ATSR found\n");
3096 
3097 	if (list_empty(&dmar_satc_units))
3098 		pr_info("No SATC found\n");
3099 
3100 	init_no_remapping_devices();
3101 
3102 	ret = init_dmars();
3103 	if (ret) {
3104 		if (force_on)
3105 			panic("tboot: Failed to initialize DMARs\n");
3106 		pr_err("Initialization failed\n");
3107 		goto out_free_dmar;
3108 	}
3109 	up_write(&dmar_global_lock);
3110 
3111 	init_iommu_pm_ops();
3112 
3113 	down_read(&dmar_global_lock);
3114 	for_each_active_iommu(iommu, drhd) {
3115 		/*
3116 		 * The flush queue implementation does not perform
3117 		 * page-selective invalidations that are required for efficient
3118 		 * TLB flushes in virtual environments.  The benefit of batching
3119 		 * is likely to be much lower than the overhead of synchronizing
3120 		 * the virtual and physical IOMMU page-tables.
3121 		 */
3122 		if (cap_caching_mode(iommu->cap) &&
3123 		    !first_level_by_default(iommu)) {
3124 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3125 			iommu_set_dma_strict();
3126 		}
3127 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3128 				       intel_iommu_groups,
3129 				       "%s", iommu->name);
3130 		/*
3131 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3132 		 * Release the dmar_global_lock before entering the device probe path
3133 		 * to avoid unnecessary lock order splat.
3134 		 */
3135 		up_read(&dmar_global_lock);
3136 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3137 		down_read(&dmar_global_lock);
3138 
3139 		iommu_pmu_register(iommu);
3140 	}
3141 
3142 	if (probe_acpi_namespace_devices())
3143 		pr_warn("ACPI name space devices didn't probe correctly\n");
3144 
3145 	/* Finally, we enable the DMA remapping hardware. */
3146 	for_each_iommu(iommu, drhd) {
3147 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3148 			iommu_enable_translation(iommu);
3149 
3150 		iommu_disable_protect_mem_regions(iommu);
3151 	}
3152 	up_read(&dmar_global_lock);
3153 
3154 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3155 
3156 	intel_iommu_enabled = 1;
3157 
3158 	return 0;
3159 
3160 out_free_dmar:
3161 	intel_iommu_free_dmars();
3162 	up_write(&dmar_global_lock);
3163 	return ret;
3164 }
3165 
3166 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3167 {
3168 	struct device_domain_info *info = opaque;
3169 
3170 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3171 	return 0;
3172 }
3173 
3174 /*
3175  * NB - intel-iommu lacks any sort of reference counting for the users of
3176  * dependent devices.  If multiple endpoints have intersecting dependent
3177  * devices, unbinding the driver from any one of them will possibly leave
3178  * the others unable to operate.
3179  */
3180 static void domain_context_clear(struct device_domain_info *info)
3181 {
3182 	if (!dev_is_pci(info->dev)) {
3183 		domain_context_clear_one(info, info->bus, info->devfn);
3184 		return;
3185 	}
3186 
3187 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3188 			       &domain_context_clear_one_cb, info);
3189 	iommu_disable_pci_ats(info);
3190 }
3191 
3192 /*
3193  * Clear the page table pointer in context or pasid table entries so that
3194  * all DMA requests without PASID from the device are blocked. If the page
3195  * table has been set, clean up the data structures.
3196  */
3197 void device_block_translation(struct device *dev)
3198 {
3199 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3200 	struct intel_iommu *iommu = info->iommu;
3201 	unsigned long flags;
3202 
3203 	/* Device in DMA blocking state. Nothing to do. */
3204 	if (!info->domain_attached)
3205 		return;
3206 
3207 	if (info->domain)
3208 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3209 
3210 	if (!dev_is_real_dma_subdevice(dev)) {
3211 		if (sm_supported(iommu))
3212 			intel_pasid_tear_down_entry(iommu, dev,
3213 						    IOMMU_NO_PASID, false);
3214 		else
3215 			domain_context_clear(info);
3216 	}
3217 
3218 	/* Device now in DMA blocking state. */
3219 	info->domain_attached = false;
3220 
3221 	if (!info->domain)
3222 		return;
3223 
3224 	spin_lock_irqsave(&info->domain->lock, flags);
3225 	list_del(&info->link);
3226 	spin_unlock_irqrestore(&info->domain->lock, flags);
3227 
3228 	domain_detach_iommu(info->domain, iommu);
3229 	info->domain = NULL;
3230 }
3231 
3232 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3233 				      struct device *dev)
3234 {
3235 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3236 
3237 	iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
3238 	device_block_translation(dev);
3239 	return 0;
3240 }
3241 
3242 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3243 					 struct device *dev, ioasid_t pasid,
3244 					 struct iommu_domain *old);
3245 
3246 static struct iommu_domain blocking_domain = {
3247 	.type = IOMMU_DOMAIN_BLOCKED,
3248 	.ops = &(const struct iommu_domain_ops) {
3249 		.attach_dev	= blocking_domain_attach_dev,
3250 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3251 	}
3252 };
3253 
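/*
 * Return the highest super-page level supported by @iommu for the given
 * translation stage (0 = 4KiB only, 1 = 2MiB, 2 = 1GiB), or 0 if super
 * pages are disabled.
 */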
3254 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3255 {
3256 	if (!intel_iommu_superpage)
3257 		return 0;
3258 
3259 	if (first_stage)
3260 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3261 
3262 	return fls(cap_super_page_val(iommu->cap));
3263 }
3264 
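/*
 * Allocate and initialize a paging domain for @dev: set up the list heads and
 * locks, derive the address width and super-page support from @dev's IOMMU,
 * compute the IOVA aperture and allocate the top-level page table.
 */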
3265 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3266 {
3267 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3268 	struct intel_iommu *iommu = info->iommu;
3269 	struct dmar_domain *domain;
3270 	int addr_width;
3271 
3272 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3273 	if (!domain)
3274 		return ERR_PTR(-ENOMEM);
3275 
3276 	INIT_LIST_HEAD(&domain->devices);
3277 	INIT_LIST_HEAD(&domain->dev_pasids);
3278 	INIT_LIST_HEAD(&domain->cache_tags);
3279 	spin_lock_init(&domain->lock);
3280 	spin_lock_init(&domain->cache_lock);
3281 	xa_init(&domain->iommu_array);
3282 	INIT_LIST_HEAD(&domain->s1_domains);
3283 	spin_lock_init(&domain->s1_lock);
3284 
3285 	domain->nid = dev_to_node(dev);
3286 	domain->use_first_level = first_stage;
3287 
3288 	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
3289 
3290 	/* calculate the address width */
3291 	addr_width = agaw_to_width(iommu->agaw);
3292 	if (addr_width > cap_mgaw(iommu->cap))
3293 		addr_width = cap_mgaw(iommu->cap);
3294 	domain->gaw = addr_width;
3295 	domain->agaw = iommu->agaw;
3296 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3297 
3298 	/* iommu memory access coherency */
3299 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3300 
3301 	/* pagesize bitmap */
3302 	domain->domain.pgsize_bitmap = SZ_4K;
3303 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3304 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3305 
3306 	/*
3307 	 * IOVA aperture: First-level translation restricts the input-address
3308 	 * to a canonical address (i.e., address bits 63:N have the same value
3309 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3310 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3311 	 */
3312 	domain->domain.geometry.force_aperture = true;
3313 	domain->domain.geometry.aperture_start = 0;
3314 	if (first_stage)
3315 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3316 	else
3317 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3318 
3319 	/* always allocate the top pgd */
3320 	domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
3321 	if (!domain->pgd) {
3322 		kfree(domain);
3323 		return ERR_PTR(-ENOMEM);
3324 	}
3325 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3326 
3327 	return domain;
3328 }
3329 
3330 static struct iommu_domain *
3331 intel_iommu_domain_alloc_first_stage(struct device *dev,
3332 				     struct intel_iommu *iommu, u32 flags)
3333 {
3334 	struct dmar_domain *dmar_domain;
3335 
3336 	if (flags & ~IOMMU_HWPT_ALLOC_PASID)
3337 		return ERR_PTR(-EOPNOTSUPP);
3338 
3339 	/* Only SL is available in legacy mode */
3340 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3341 		return ERR_PTR(-EOPNOTSUPP);
3342 
3343 	dmar_domain = paging_domain_alloc(dev, true);
3344 	if (IS_ERR(dmar_domain))
3345 		return ERR_CAST(dmar_domain);
3346 
3347 	dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
3348 	/*
3349 	 * iotlb sync for map is only needed for legacy implementations that
3350 	 * explicitly require flushing internal write buffers to ensure memory
3351 	 * coherence.
3352 	 */
3353 	if (rwbf_required(iommu))
3354 		dmar_domain->iotlb_sync_map = true;
3355 
3356 	return &dmar_domain->domain;
3357 }
3358 
3359 static struct iommu_domain *
3360 intel_iommu_domain_alloc_second_stage(struct device *dev,
3361 				      struct intel_iommu *iommu, u32 flags)
3362 {
3363 	struct dmar_domain *dmar_domain;
3364 
3365 	if (flags &
3366 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3367 	       IOMMU_HWPT_ALLOC_PASID)))
3368 		return ERR_PTR(-EOPNOTSUPP);
3369 
3370 	if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
3371 	     !nested_supported(iommu)) ||
3372 	    ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
3373 	     !ssads_supported(iommu)))
3374 		return ERR_PTR(-EOPNOTSUPP);
3375 
3376 	/* Legacy mode always supports second stage */
3377 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3378 		return ERR_PTR(-EOPNOTSUPP);
3379 
3380 	dmar_domain = paging_domain_alloc(dev, false);
3381 	if (IS_ERR(dmar_domain))
3382 		return ERR_CAST(dmar_domain);
3383 
3384 	dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
3385 	dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3386 
3387 	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
3388 		dmar_domain->domain.dirty_ops = &intel_dirty_ops;
3389 
3390 	/*
3391 	 * Besides the internal write buffer flush, the caching mode used for
3392 	 * legacy nested translation (which utilizes shadowing page tables)
3393 	 * also requires iotlb sync on map.
3394 	 */
3395 	if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
3396 		dmar_domain->iotlb_sync_map = true;
3397 
3398 	return &dmar_domain->domain;
3399 }
3400 
3401 static struct iommu_domain *
3402 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3403 				      const struct iommu_user_data *user_data)
3404 {
3405 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3406 	struct intel_iommu *iommu = info->iommu;
3407 	struct iommu_domain *domain;
3408 
3409 	if (user_data)
3410 		return ERR_PTR(-EOPNOTSUPP);
3411 
3412 	/* Prefer first stage if possible by default. */
3413 	domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
3414 	if (domain != ERR_PTR(-EOPNOTSUPP))
3415 		return domain;
3416 	return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
3417 }
3418 
3419 static void intel_iommu_domain_free(struct iommu_domain *domain)
3420 {
3421 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3422 
3423 	if (WARN_ON(dmar_domain->nested_parent &&
3424 		    !list_empty(&dmar_domain->s1_domains)))
3425 		return;
3426 
3427 	if (WARN_ON(!list_empty(&dmar_domain->devices)))
3428 		return;
3429 
3430 	if (dmar_domain->pgd) {
3431 		struct iommu_pages_list freelist =
3432 			IOMMU_PAGES_LIST_INIT(freelist);
3433 
3434 		domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
3435 			     &freelist);
3436 		iommu_put_pages_list(&freelist);
3437 	}
3438 
3439 	kfree(dmar_domain->qi_batch);
3440 	kfree(dmar_domain);
3441 }
3442 
3443 static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
3444 						struct intel_iommu *iommu)
3445 {
3446 	if (WARN_ON(dmar_domain->domain.dirty_ops ||
3447 		    dmar_domain->nested_parent))
3448 		return -EINVAL;
3449 
3450 	/* Only SL is available in legacy mode */
3451 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3452 		return -EINVAL;
3453 
3454 	/* Same page size support */
3455 	if (!cap_fl1gp_support(iommu->cap) &&
3456 	    (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3457 		return -EINVAL;
3458 
3459 	/* iotlb sync on map requirement */
3460 	if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
3461 		return -EINVAL;
3462 
3463 	return 0;
3464 }
3465 
3466 static int
3467 paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
3468 				      struct intel_iommu *iommu)
3469 {
3470 	unsigned int sslps = cap_super_page_val(iommu->cap);
3471 
3472 	if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
3473 		return -EINVAL;
3474 	if (dmar_domain->nested_parent && !nested_supported(iommu))
3475 		return -EINVAL;
3476 
3477 	/* Legacy mode always supports second stage */
3478 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3479 		return -EINVAL;
3480 
3481 	/* Same page size support */
3482 	if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
3483 		return -EINVAL;
3484 	if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3485 		return -EINVAL;
3486 
3487 	/* iotlb sync on map requirement */
3488 	if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
3489 	    !dmar_domain->iotlb_sync_map)
3490 		return -EINVAL;
3491 
3492 	return 0;
3493 }
3494 
3495 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3496 {
3497 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3498 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3499 	struct intel_iommu *iommu = info->iommu;
3500 	int ret = -EINVAL;
3501 	int addr_width;
3502 
3503 	if (intel_domain_is_fs_paging(dmar_domain))
3504 		ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
3505 	else if (intel_domain_is_ss_paging(dmar_domain))
3506 		ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
3507 	else if (WARN_ON(true))
3508 		ret = -EINVAL;
3509 	if (ret)
3510 		return ret;
3511 
3512 	/*
3513 	 * FIXME this is locked wrong, it needs to be under the
3514 	 * dmar_domain->lock
3515 	 */
3516 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3517 		return -EINVAL;
3518 
3519 	if (dmar_domain->iommu_coherency !=
3520 			iommu_paging_structure_coherency(iommu))
3521 		return -EINVAL;
3522 
3524 	/* check if this iommu agaw is sufficient for max mapped address */
3525 	addr_width = agaw_to_width(iommu->agaw);
3526 	if (addr_width > cap_mgaw(iommu->cap))
3527 		addr_width = cap_mgaw(iommu->cap);
3528 
3529 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3530 		return -EINVAL;
3531 
3532 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3533 	    context_copied(iommu, info->bus, info->devfn))
3534 		return intel_pasid_setup_sm_context(dev);
3535 
3536 	return 0;
3537 }
3538 
3539 static int intel_iommu_attach_device(struct iommu_domain *domain,
3540 				     struct device *dev)
3541 {
3542 	int ret;
3543 
3544 	device_block_translation(dev);
3545 
3546 	ret = paging_domain_compatible(domain, dev);
3547 	if (ret)
3548 		return ret;
3549 
3550 	ret = iopf_for_domain_set(domain, dev);
3551 	if (ret)
3552 		return ret;
3553 
3554 	ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
3555 	if (ret)
3556 		iopf_for_domain_remove(domain, dev);
3557 
3558 	return ret;
3559 }
3560 
3561 static int intel_iommu_map(struct iommu_domain *domain,
3562 			   unsigned long iova, phys_addr_t hpa,
3563 			   size_t size, int iommu_prot, gfp_t gfp)
3564 {
3565 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3566 	u64 max_addr;
3567 	int prot = 0;
3568 
3569 	if (iommu_prot & IOMMU_READ)
3570 		prot |= DMA_PTE_READ;
3571 	if (iommu_prot & IOMMU_WRITE)
3572 		prot |= DMA_PTE_WRITE;
3573 	if (dmar_domain->set_pte_snp)
3574 		prot |= DMA_PTE_SNP;
3575 
3576 	max_addr = iova + size;
3577 	if (dmar_domain->max_addr < max_addr) {
3578 		u64 end;
3579 
3580 		/* check if minimum agaw is sufficient for mapped address */
3581 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3582 		if (end < max_addr) {
3583 			pr_err("%s: iommu width (%d) is not "
3584 			       "sufficient for the mapped address (%llx)\n",
3585 			       __func__, dmar_domain->gaw, max_addr);
3586 			return -EFAULT;
3587 		}
3588 		dmar_domain->max_addr = max_addr;
3589 	}
3590 	/* Round up size to next multiple of PAGE_SIZE, if it and
3591 	   the low bits of hpa would take us onto the next page */
3592 	size = aligned_nrpages(hpa, size);
3593 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3594 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3595 }
3596 
3597 static int intel_iommu_map_pages(struct iommu_domain *domain,
3598 				 unsigned long iova, phys_addr_t paddr,
3599 				 size_t pgsize, size_t pgcount,
3600 				 int prot, gfp_t gfp, size_t *mapped)
3601 {
3602 	unsigned long pgshift = __ffs(pgsize);
3603 	size_t size = pgcount << pgshift;
3604 	int ret;
3605 
3606 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3607 		return -EINVAL;
3608 
3609 	if (!IS_ALIGNED(iova | paddr, pgsize))
3610 		return -EINVAL;
3611 
3612 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3613 	if (!ret && mapped)
3614 		*mapped = size;
3615 
3616 	return ret;
3617 }
3618 
3619 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3620 				unsigned long iova, size_t size,
3621 				struct iommu_iotlb_gather *gather)
3622 {
3623 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3624 	unsigned long start_pfn, last_pfn;
3625 	int level = 0;
3626 
3627 	/* Cope with horrid API which requires us to unmap more than the
3628 	   size argument if it happens to be a large-page mapping. */
3629 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3630 				     &level, GFP_ATOMIC)))
3631 		return 0;
3632 
3633 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3634 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3635 
3636 	start_pfn = iova >> VTD_PAGE_SHIFT;
3637 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3638 
3639 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3640 
3641 	if (dmar_domain->max_addr == iova + size)
3642 		dmar_domain->max_addr = iova;
3643 
3644 	/*
3645 	 * We do not use page-selective IOTLB invalidation in flush queue,
3646 	 * so there is no need to track page and sync iotlb.
3647 	 */
3648 	if (!iommu_iotlb_gather_queued(gather))
3649 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3650 
3651 	return size;
3652 }
3653 
3654 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3655 				      unsigned long iova,
3656 				      size_t pgsize, size_t pgcount,
3657 				      struct iommu_iotlb_gather *gather)
3658 {
3659 	unsigned long pgshift = __ffs(pgsize);
3660 	size_t size = pgcount << pgshift;
3661 
3662 	return intel_iommu_unmap(domain, iova, size, gather);
3663 }
3664 
3665 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3666 				 struct iommu_iotlb_gather *gather)
3667 {
3668 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3669 			      gather->end,
3670 			      iommu_pages_list_empty(&gather->freelist));
3671 	iommu_put_pages_list(&gather->freelist);
3672 }
3673 
3674 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3675 					    dma_addr_t iova)
3676 {
3677 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3678 	struct dma_pte *pte;
3679 	int level = 0;
3680 	u64 phys = 0;
3681 
3682 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3683 			     GFP_ATOMIC);
3684 	if (pte && dma_pte_present(pte))
3685 		phys = dma_pte_addr(pte) +
3686 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3687 						VTD_PAGE_SHIFT) - 1));
3688 
3689 	return phys;
3690 }
3691 
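/*
 * Check that every device currently attached to @domain sits behind an IOMMU
 * with snoop control support, so that force snooping can be enforced.
 */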
3692 static bool domain_support_force_snooping(struct dmar_domain *domain)
3693 {
3694 	struct device_domain_info *info;
3695 	bool support = true;
3696 
3697 	assert_spin_locked(&domain->lock);
3698 	list_for_each_entry(info, &domain->devices, link) {
3699 		if (!ecap_sc_support(info->iommu->ecap)) {
3700 			support = false;
3701 			break;
3702 		}
3703 	}
3704 
3705 	return support;
3706 }
3707 
3708 static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
3709 {
3710 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3711 	struct device_domain_info *info;
3712 
3713 	guard(spinlock_irqsave)(&dmar_domain->lock);
3714 
3715 	if (dmar_domain->force_snooping)
3716 		return true;
3717 
3718 	if (!domain_support_force_snooping(dmar_domain))
3719 		return false;
3720 
3721 	dmar_domain->force_snooping = true;
3722 	list_for_each_entry(info, &dmar_domain->devices, link)
3723 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3724 						     IOMMU_NO_PASID);
3725 	return true;
3726 }
3727 
3728 static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
3729 {
3730 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3731 
3732 	guard(spinlock_irqsave)(&dmar_domain->lock);
3733 	if (!domain_support_force_snooping(dmar_domain) ||
3734 	    dmar_domain->has_mappings)
3735 		return false;
3736 
3737 	/*
3738 	 * Second level page table supports per-PTE snoop control. The
3739 	 * iommu_map() interface will handle this by setting SNP bit.
3740 	 */
3741 	dmar_domain->set_pte_snp = true;
3742 	dmar_domain->force_snooping = true;
3743 	return true;
3744 }
3745 
3746 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3747 {
3748 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3749 
3750 	switch (cap) {
3751 	case IOMMU_CAP_CACHE_COHERENCY:
3752 	case IOMMU_CAP_DEFERRED_FLUSH:
3753 		return true;
3754 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3755 		return dmar_platform_optin();
3756 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3757 		return ecap_sc_support(info->iommu->ecap);
3758 	case IOMMU_CAP_DIRTY_TRACKING:
3759 		return ssads_supported(info->iommu);
3760 	default:
3761 		return false;
3762 	}
3763 }
3764 
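/*
 * Per-device probe: look up the IOMMU unit that translates this device,
 * record its bus/devfn/segment, detect ATS/PASID/PRI capabilities, and, in
 * scalable mode, allocate the PASID table and set up the scalable-mode
 * context entry unless it was copied from a previous kernel.
 */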
3765 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3766 {
3767 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3768 	struct device_domain_info *info;
3769 	struct intel_iommu *iommu;
3770 	u8 bus, devfn;
3771 	int ret;
3772 
3773 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3774 	if (!iommu || !iommu->iommu.ops)
3775 		return ERR_PTR(-ENODEV);
3776 
3777 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3778 	if (!info)
3779 		return ERR_PTR(-ENOMEM);
3780 
3781 	if (dev_is_real_dma_subdevice(dev)) {
3782 		info->bus = pdev->bus->number;
3783 		info->devfn = pdev->devfn;
3784 		info->segment = pci_domain_nr(pdev->bus);
3785 	} else {
3786 		info->bus = bus;
3787 		info->devfn = devfn;
3788 		info->segment = iommu->segment;
3789 	}
3790 
3791 	info->dev = dev;
3792 	info->iommu = iommu;
3793 	if (dev_is_pci(dev)) {
3794 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3795 		    pci_ats_supported(pdev) &&
3796 		    dmar_ats_supported(pdev, iommu)) {
3797 			info->ats_supported = 1;
3798 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3799 
3800 			/*
			 * For an IOMMU that supports device IOTLB throttling
			 * (DIT), assign the PFSID in a VF's invalidation
			 * descriptors so that the IOMMU hardware can gauge
			 * the queue depth at the PF level. If DIT is not
			 * supported, the PFSID field is treated as reserved
			 * and must be set to 0.
3806 			 */
3807 			if (ecap_dit(iommu->ecap))
3808 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3809 			info->ats_qdep = pci_ats_queue_depth(pdev);
3810 		}
3811 		if (sm_supported(iommu)) {
3812 			if (pasid_supported(iommu)) {
3813 				int features = pci_pasid_features(pdev);
3814 
3815 				if (features >= 0)
3816 					info->pasid_supported = features | 1;
3817 			}
3818 
3819 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3820 			    pci_pri_supported(pdev))
3821 				info->pri_supported = 1;
3822 		}
3823 	}
3824 
3825 	dev_iommu_priv_set(dev, info);
3826 	if (pdev && pci_ats_supported(pdev)) {
3827 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3828 		ret = device_rbtree_insert(iommu, info);
3829 		if (ret)
3830 			goto free;
3831 	}
3832 
3833 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3834 		ret = intel_pasid_alloc_table(dev);
3835 		if (ret) {
3836 			dev_err(dev, "PASID table allocation failed\n");
3837 			goto clear_rbtree;
3838 		}
3839 
3840 		if (!context_copied(iommu, info->bus, info->devfn)) {
3841 			ret = intel_pasid_setup_sm_context(dev);
3842 			if (ret)
3843 				goto free_table;
3844 		}
3845 	}
3846 
3847 	intel_iommu_debugfs_create_dev(info);
3848 
3849 	return &iommu->iommu;
3850 free_table:
3851 	intel_pasid_free_table(dev);
3852 clear_rbtree:
3853 	device_rbtree_remove(info);
3854 free:
3855 	kfree(info);
3856 
3857 	return ERR_PTR(ret);
3858 }
3859 
3860 static void intel_iommu_probe_finalize(struct device *dev)
3861 {
3862 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3863 	struct intel_iommu *iommu = info->iommu;
3864 
3865 	/*
3866 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3867 	 * device is undefined if you enable PASID support after ATS support.
3868 	 * So always enable PASID support on devices which have it, even if
3869 	 * we can't yet know if we're ever going to use it.
3870 	 */
3871 	if (info->pasid_supported &&
3872 	    !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3873 		info->pasid_enabled = 1;
3874 
3875 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3876 		iommu_enable_pci_ats(info);
3877 		/* Assign a DEVTLB cache tag to the default domain. */
3878 		if (info->ats_enabled && info->domain) {
3879 			u16 did = domain_id_iommu(info->domain, iommu);
3880 
3881 			if (cache_tag_assign(info->domain, did, dev,
3882 					     IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
3883 				iommu_disable_pci_ats(info);
3884 		}
3885 	}
3886 	iommu_enable_pci_pri(info);
3887 }
3888 
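/*
 * Undo intel_iommu_probe_device(): disable PRI, ATS and PASID on the device,
 * drop it from the RID rbtree, tear down the scalable-mode context if this
 * kernel owns it, and free the PASID table and tracking structure.
 */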
3889 static void intel_iommu_release_device(struct device *dev)
3890 {
3891 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3892 	struct intel_iommu *iommu = info->iommu;
3893 
3894 	iommu_disable_pci_pri(info);
3895 	iommu_disable_pci_ats(info);
3896 
3897 	if (info->pasid_enabled) {
3898 		pci_disable_pasid(to_pci_dev(dev));
3899 		info->pasid_enabled = 0;
3900 	}
3901 
3902 	mutex_lock(&iommu->iopf_lock);
3903 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3904 		device_rbtree_remove(info);
3905 	mutex_unlock(&iommu->iopf_lock);
3906 
3907 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3908 	    !context_copied(iommu, info->bus, info->devfn))
3909 		intel_pasid_teardown_sm_context(dev);
3910 
3911 	intel_pasid_free_table(dev);
3912 	intel_iommu_debugfs_remove_dev(info);
3913 	kfree(info);
3914 }
3915 
3916 static void intel_iommu_get_resv_regions(struct device *device,
3917 					 struct list_head *head)
3918 {
3919 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3920 	struct iommu_resv_region *reg;
3921 	struct dmar_rmrr_unit *rmrr;
3922 	struct device *i_dev;
3923 	int i;
3924 
3925 	rcu_read_lock();
3926 	for_each_rmrr_units(rmrr) {
3927 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3928 					  i, i_dev) {
3929 			struct iommu_resv_region *resv;
3930 			enum iommu_resv_type type;
3931 			size_t length;
3932 
3933 			if (i_dev != device &&
3934 			    !is_downstream_to_pci_bridge(device, i_dev))
3935 				continue;
3936 
3937 			length = rmrr->end_address - rmrr->base_address + 1;
3938 
3939 			type = device_rmrr_is_relaxable(device) ?
3940 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3941 
3942 			resv = iommu_alloc_resv_region(rmrr->base_address,
3943 						       length, prot, type,
3944 						       GFP_ATOMIC);
3945 			if (!resv)
3946 				break;
3947 
3948 			list_add_tail(&resv->list, head);
3949 		}
3950 	}
3951 	rcu_read_unlock();
3952 
3953 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3954 	if (dev_is_pci(device)) {
3955 		struct pci_dev *pdev = to_pci_dev(device);
3956 
3957 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3958 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3959 					IOMMU_RESV_DIRECT_RELAXABLE,
3960 					GFP_KERNEL);
3961 			if (reg)
3962 				list_add_tail(&reg->list, head);
3963 		}
3964 	}
3965 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3966 
3967 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3968 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3969 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3970 	if (!reg)
3971 		return;
3972 	list_add_tail(&reg->list, head);
3973 }
3974 
3975 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3976 {
3977 	if (dev_is_pci(dev))
3978 		return pci_device_group(dev);
3979 	return generic_device_group(dev);
3980 }
3981 
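/*
 * Reference-counted enabling of I/O page faults for a device. PRI must
 * already be enabled; the first caller adds the device to the IOMMU's IOPF
 * queue, subsequent callers only bump the refcount.
 */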
3982 int intel_iommu_enable_iopf(struct device *dev)
3983 {
3984 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3985 	struct intel_iommu *iommu = info->iommu;
3986 	int ret;
3987 
3988 	if (!info->pri_enabled)
3989 		return -ENODEV;
3990 
3991 	/* pri_enabled is protected by the group mutex. */
3992 	iommu_group_mutex_assert(dev);
3993 	if (info->iopf_refcount) {
3994 		info->iopf_refcount++;
3995 		return 0;
3996 	}
3997 
3998 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3999 	if (ret)
4000 		return ret;
4001 
4002 	info->iopf_refcount = 1;
4003 
4004 	return 0;
4005 }
4006 
4007 void intel_iommu_disable_iopf(struct device *dev)
4008 {
4009 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4010 	struct intel_iommu *iommu = info->iommu;
4011 
4012 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
4013 		return;
4014 
4015 	iommu_group_mutex_assert(dev);
4016 	if (--info->iopf_refcount)
4017 		return;
4018 
4019 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4020 }
4021 
4022 static bool intel_iommu_is_attach_deferred(struct device *dev)
4023 {
4024 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4025 
4026 	return translation_pre_enabled(info->iommu) && !info->domain;
4027 }
4028 
4029 /*
 * Check that the device does not live on an external-facing PCI port that is
 * marked as untrusted. Quirks must not be applied to such devices, since a
 * quirk could otherwise let them bypass the IOMMU restrictions.
4033  */
4034 static bool risky_device(struct pci_dev *pdev)
4035 {
4036 	if (pdev->untrusted) {
4037 		pci_info(pdev,
4038 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4039 			 pdev->vendor, pdev->device);
4040 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4041 		return true;
4042 	}
4043 	return false;
4044 }
4045 
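/*
 * Called after new mappings are created. For domains that need it, flush
 * the caches for the just-mapped range so that stale non-present entries
 * are purged.
 */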
4046 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4047 				      unsigned long iova, size_t size)
4048 {
4049 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4050 
4051 	if (dmar_domain->iotlb_sync_map)
4052 		cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
4053 
4054 	return 0;
4055 }
4056 
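/*
 * Undo domain_add_dev_pasid(): drop the {device, PASID} tracking entry from
 * the domain, unassign the cache tag and release the domain's reference on
 * the IOMMU. The identity domain keeps no per-PASID metadata, so it is
 * skipped.
 */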
4057 void domain_remove_dev_pasid(struct iommu_domain *domain,
4058 			     struct device *dev, ioasid_t pasid)
4059 {
4060 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4061 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4062 	struct intel_iommu *iommu = info->iommu;
4063 	struct dmar_domain *dmar_domain;
4064 	unsigned long flags;
4065 
4066 	if (!domain)
4067 		return;
4068 
	/* The identity domain keeps no per-PASID metadata. */
4070 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4071 		return;
4072 
4073 	dmar_domain = to_dmar_domain(domain);
4074 	spin_lock_irqsave(&dmar_domain->lock, flags);
4075 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4076 		if (curr->dev == dev && curr->pasid == pasid) {
4077 			list_del(&curr->link_domain);
4078 			dev_pasid = curr;
4079 			break;
4080 		}
4081 	}
4082 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4083 
4084 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4085 	domain_detach_iommu(dmar_domain, iommu);
4086 	if (!WARN_ON_ONCE(!dev_pasid)) {
4087 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4088 		kfree(dev_pasid);
4089 	}
4090 }
4091 
4092 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4093 					 struct device *dev, ioasid_t pasid,
4094 					 struct iommu_domain *old)
4095 {
4096 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4097 
4098 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4099 	iopf_for_domain_remove(old, dev);
4100 	domain_remove_dev_pasid(old, dev, pasid);
4101 
4102 	return 0;
4103 }
4104 
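/*
 * Record that a {device, PASID} pair is attached to this domain: take a
 * reference on the IOMMU, assign a cache tag and link a new dev_pasid_info
 * into the domain's dev_pasids list.
 */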
4105 struct dev_pasid_info *
4106 domain_add_dev_pasid(struct iommu_domain *domain,
4107 		     struct device *dev, ioasid_t pasid)
4108 {
4109 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4110 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4111 	struct intel_iommu *iommu = info->iommu;
4112 	struct dev_pasid_info *dev_pasid;
4113 	unsigned long flags;
4114 	int ret;
4115 
4116 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4117 	if (!dev_pasid)
4118 		return ERR_PTR(-ENOMEM);
4119 
4120 	ret = domain_attach_iommu(dmar_domain, iommu);
4121 	if (ret)
4122 		goto out_free;
4123 
4124 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4125 	if (ret)
4126 		goto out_detach_iommu;
4127 
4128 	dev_pasid->dev = dev;
4129 	dev_pasid->pasid = pasid;
4130 	spin_lock_irqsave(&dmar_domain->lock, flags);
4131 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4132 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4133 
4134 	return dev_pasid;
4135 out_detach_iommu:
4136 	domain_detach_iommu(dmar_domain, iommu);
4137 out_free:
4138 	kfree(dev_pasid);
4139 	return ERR_PTR(ret);
4140 }
4141 
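/*
 * Attach a paging domain to a specific PASID of a device, replacing the
 * domain previously attached to that PASID. On success the old attachment
 * is torn down; on failure the new state is unwound and the old attachment
 * is left in place.
 */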
4142 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4143 				     struct device *dev, ioasid_t pasid,
4144 				     struct iommu_domain *old)
4145 {
4146 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4147 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4148 	struct intel_iommu *iommu = info->iommu;
4149 	struct dev_pasid_info *dev_pasid;
4150 	int ret;
4151 
4152 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4153 		return -EINVAL;
4154 
4155 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4156 		return -EOPNOTSUPP;
4157 
4158 	if (domain->dirty_ops)
4159 		return -EINVAL;
4160 
4161 	if (context_copied(iommu, info->bus, info->devfn))
4162 		return -EBUSY;
4163 
4164 	ret = paging_domain_compatible(domain, dev);
4165 	if (ret)
4166 		return ret;
4167 
4168 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4169 	if (IS_ERR(dev_pasid))
4170 		return PTR_ERR(dev_pasid);
4171 
4172 	ret = iopf_for_domain_replace(domain, old, dev);
4173 	if (ret)
4174 		goto out_remove_dev_pasid;
4175 
4176 	if (intel_domain_is_fs_paging(dmar_domain))
4177 		ret = domain_setup_first_level(iommu, dmar_domain,
4178 					       dev, pasid, old);
4179 	else if (intel_domain_is_ss_paging(dmar_domain))
4180 		ret = domain_setup_second_level(iommu, dmar_domain,
4181 						dev, pasid, old);
4182 	else if (WARN_ON(true))
4183 		ret = -EINVAL;
4184 
4185 	if (ret)
4186 		goto out_unwind_iopf;
4187 
4188 	domain_remove_dev_pasid(old, dev, pasid);
4189 
4190 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4191 
4192 	return 0;
4193 
4194 out_unwind_iopf:
4195 	iopf_for_domain_replace(old, domain, dev);
4196 out_remove_dev_pasid:
4197 	domain_remove_dev_pasid(domain, dev, pasid);
4198 	return ret;
4199 }
4200 
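/*
 * Report VT-d hardware information (capability and extended capability
 * registers plus known errata flags) to the iommufd core.
 */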
4201 static void *intel_iommu_hw_info(struct device *dev, u32 *length,
4202 				 enum iommu_hw_info_type *type)
4203 {
4204 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4205 	struct intel_iommu *iommu = info->iommu;
4206 	struct iommu_hw_info_vtd *vtd;
4207 
4208 	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
4209 	    *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
4210 		return ERR_PTR(-EOPNOTSUPP);
4211 
4212 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4213 	if (!vtd)
4214 		return ERR_PTR(-ENOMEM);
4215 
4216 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4217 	vtd->cap_reg = iommu->cap;
4218 	vtd->ecap_reg = iommu->ecap;
4219 	*length = sizeof(*vtd);
4220 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4221 	return vtd;
4222 }
4223 
4224 /*
4225  * Set dirty tracking for the device list of a domain. The caller must
4226  * hold the domain->lock when calling it.
4227  */
4228 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4229 {
4230 	struct device_domain_info *info;
4231 	int ret = 0;
4232 
4233 	list_for_each_entry(info, devices, link) {
4234 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4235 						       IOMMU_NO_PASID, enable);
4236 		if (ret)
4237 			break;
4238 	}
4239 
4240 	return ret;
4241 }
4242 
4243 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4244 					    bool enable)
4245 {
4246 	struct dmar_domain *s1_domain;
4247 	unsigned long flags;
4248 	int ret;
4249 
4250 	spin_lock(&domain->s1_lock);
4251 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4252 		spin_lock_irqsave(&s1_domain->lock, flags);
4253 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4254 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4255 		if (ret)
4256 			goto err_unwind;
4257 	}
4258 	spin_unlock(&domain->s1_lock);
4259 	return 0;
4260 
4261 err_unwind:
4262 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4263 		spin_lock_irqsave(&s1_domain->lock, flags);
4264 		device_set_dirty_tracking(&s1_domain->devices,
4265 					  domain->dirty_tracking);
4266 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4267 	}
4268 	spin_unlock(&domain->s1_lock);
4269 	return ret;
4270 }
4271 
4272 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4273 					  bool enable)
4274 {
4275 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4276 	int ret;
4277 
4278 	spin_lock(&dmar_domain->lock);
4279 	if (dmar_domain->dirty_tracking == enable)
4280 		goto out_unlock;
4281 
4282 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4283 	if (ret)
4284 		goto err_unwind;
4285 
4286 	if (dmar_domain->nested_parent) {
4287 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4288 		if (ret)
4289 			goto err_unwind;
4290 	}
4291 
4292 	dmar_domain->dirty_tracking = enable;
4293 out_unlock:
4294 	spin_unlock(&dmar_domain->lock);
4295 
4296 	return 0;
4297 
4298 err_unwind:
4299 	device_set_dirty_tracking(&dmar_domain->devices,
4300 				  dmar_domain->dirty_tracking);
4301 	spin_unlock(&dmar_domain->lock);
4302 	return ret;
4303 }
4304 
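/*
 * Walk the page table for the given IOVA range, record dirty PTEs in the
 * caller's bitmap and clear the dirty bit unless the flags ask for it to be
 * preserved.
 */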
4305 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4306 					    unsigned long iova, size_t size,
4307 					    unsigned long flags,
4308 					    struct iommu_dirty_bitmap *dirty)
4309 {
4310 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311 	unsigned long end = iova + size - 1;
4312 	unsigned long pgsize;
4313 
4314 	/*
4315 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4316 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4317 	 * have occurred when we stopped dirty tracking. This ensures that we
4318 	 * never inherit dirtied bits from a previous cycle.
4319 	 */
4320 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4321 		return -EINVAL;
4322 
4323 	do {
4324 		struct dma_pte *pte;
4325 		int lvl = 0;
4326 
4327 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4328 				     GFP_ATOMIC);
4329 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4330 		if (!pte || !dma_pte_present(pte)) {
4331 			iova += pgsize;
4332 			continue;
4333 		}
4334 
4335 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4336 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4337 		iova += pgsize;
4338 	} while (iova < end);
4339 
4340 	return 0;
4341 }
4342 
4343 static const struct iommu_dirty_ops intel_dirty_ops = {
4344 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4345 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4346 };
4347 
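/*
 * Program a legacy-mode context entry for pass-through translation: use
 * FLPT_DEFAULT_DID, the widest address width supported by the IOMMU and
 * CONTEXT_TT_PASS_THROUGH, then flush the context cache.
 */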
4348 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4349 {
4350 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4351 	struct intel_iommu *iommu = info->iommu;
4352 	struct context_entry *context;
4353 
4354 	spin_lock(&iommu->lock);
4355 	context = iommu_context_addr(iommu, bus, devfn, 1);
4356 	if (!context) {
4357 		spin_unlock(&iommu->lock);
4358 		return -ENOMEM;
4359 	}
4360 
4361 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4362 		spin_unlock(&iommu->lock);
4363 		return 0;
4364 	}
4365 
4366 	copied_context_tear_down(iommu, context, bus, devfn);
4367 	context_clear_entry(context);
4368 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4369 
4370 	/*
	 * In pass-through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware, and ASR is ignored by hardware.
4373 	 */
4374 	context_set_address_width(context, iommu->msagaw);
4375 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4376 	context_set_fault_enable(context);
4377 	context_set_present(context);
4378 	if (!ecap_coherent(iommu->ecap))
4379 		clflush_cache_range(context, sizeof(*context));
4380 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4381 	spin_unlock(&iommu->lock);
4382 
4383 	return 0;
4384 }
4385 
4386 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4387 {
4388 	struct device *dev = data;
4389 
4390 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4391 }
4392 
4393 static int device_setup_pass_through(struct device *dev)
4394 {
4395 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4396 
4397 	if (!dev_is_pci(dev))
4398 		return context_setup_pass_through(dev, info->bus, info->devfn);
4399 
4400 	return pci_for_each_dma_alias(to_pci_dev(dev),
4401 				      context_setup_pass_through_cb, dev);
4402 }
4403 
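/*
 * Attach a device to the global identity (pass-through) domain: block the
 * current translation first, then program pass-through either through the
 * scalable-mode PASID table or through legacy context entries for all DMA
 * aliases.
 */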
4404 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4405 {
4406 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4407 	struct intel_iommu *iommu = info->iommu;
4408 	int ret;
4409 
4410 	device_block_translation(dev);
4411 
4412 	if (dev_is_real_dma_subdevice(dev))
4413 		return 0;
4414 
4415 	/*
	 * There is no PRI support with the global identity domain, and no need
	 * to enable or disable PRI in this path, as the IOMMU has already been
	 * put into the blocking state.
4419 	 */
4420 	if (sm_supported(iommu))
4421 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4422 	else
4423 		ret = device_setup_pass_through(dev);
4424 
4425 	if (!ret)
4426 		info->domain_attached = true;
4427 
4428 	return ret;
4429 }
4430 
4431 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4432 					 struct device *dev, ioasid_t pasid,
4433 					 struct iommu_domain *old)
4434 {
4435 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4436 	struct intel_iommu *iommu = info->iommu;
4437 	int ret;
4438 
4439 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4440 		return -EOPNOTSUPP;
4441 
4442 	ret = iopf_for_domain_replace(domain, old, dev);
4443 	if (ret)
4444 		return ret;
4445 
4446 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4447 	if (ret) {
4448 		iopf_for_domain_replace(old, domain, dev);
4449 		return ret;
4450 	}
4451 
4452 	domain_remove_dev_pasid(old, dev, pasid);
4453 	return 0;
4454 }
4455 
4456 static struct iommu_domain identity_domain = {
4457 	.type = IOMMU_DOMAIN_IDENTITY,
4458 	.ops = &(const struct iommu_domain_ops) {
4459 		.attach_dev	= identity_domain_attach_dev,
4460 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4461 	},
4462 };
4463 
4464 const struct iommu_domain_ops intel_fs_paging_domain_ops = {
4465 	.attach_dev = intel_iommu_attach_device,
4466 	.set_dev_pasid = intel_iommu_set_dev_pasid,
4467 	.map_pages = intel_iommu_map_pages,
4468 	.unmap_pages = intel_iommu_unmap_pages,
4469 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
4470 	.flush_iotlb_all = intel_flush_iotlb_all,
4471 	.iotlb_sync = intel_iommu_tlb_sync,
4472 	.iova_to_phys = intel_iommu_iova_to_phys,
4473 	.free = intel_iommu_domain_free,
4474 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
4475 };
4476 
4477 const struct iommu_domain_ops intel_ss_paging_domain_ops = {
4478 	.attach_dev = intel_iommu_attach_device,
4479 	.set_dev_pasid = intel_iommu_set_dev_pasid,
4480 	.map_pages = intel_iommu_map_pages,
4481 	.unmap_pages = intel_iommu_unmap_pages,
4482 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
4483 	.flush_iotlb_all = intel_flush_iotlb_all,
4484 	.iotlb_sync = intel_iommu_tlb_sync,
4485 	.iova_to_phys = intel_iommu_iova_to_phys,
4486 	.free = intel_iommu_domain_free,
4487 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
4488 };
4489 
4490 const struct iommu_ops intel_iommu_ops = {
4491 	.blocked_domain		= &blocking_domain,
4492 	.release_domain		= &blocking_domain,
4493 	.identity_domain	= &identity_domain,
4494 	.capable		= intel_iommu_capable,
4495 	.hw_info		= intel_iommu_hw_info,
4496 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4497 	.domain_alloc_sva	= intel_svm_domain_alloc,
4498 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4499 	.probe_device		= intel_iommu_probe_device,
4500 	.probe_finalize		= intel_iommu_probe_finalize,
4501 	.release_device		= intel_iommu_release_device,
4502 	.get_resv_regions	= intel_iommu_get_resv_regions,
4503 	.device_group		= intel_iommu_device_group,
4504 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4505 	.def_domain_type	= device_def_domain_type,
4506 	.page_response		= intel_iommu_page_response,
4507 };
4508 
4509 static void quirk_iommu_igfx(struct pci_dev *dev)
4510 {
4511 	if (risky_device(dev))
4512 		return;
4513 
4514 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4515 	disable_igfx_iommu = 1;
4516 }
4517 
4518 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4526 
4527 /* QM57/QS57 integrated gfx malfunctions with dmar */
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4529 
4530 /* Broadwell igfx malfunctions with dmar */
4531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4539 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4540 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4545 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4552 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4553 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4555 
4556 static void quirk_iommu_rwbf(struct pci_dev *dev)
4557 {
4558 	if (risky_device(dev))
4559 		return;
4560 
4561 	/*
4562 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4563 	 * but needs it. Same seems to hold for the desktop versions.
4564 	 */
4565 	pci_info(dev, "Forcing write-buffer flush capability\n");
4566 	rwbf_quirk = 1;
4567 }
4568 
4569 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4570 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4571 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4573 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4574 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4575 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4576 
4577 #define GGC 0x52
4578 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4579 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4580 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4581 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4582 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4583 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4584 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4585 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4586 
4587 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4588 {
4589 	unsigned short ggc;
4590 
4591 	if (risky_device(dev))
4592 		return;
4593 
4594 	if (pci_read_config_word(dev, GGC, &ggc))
4595 		return;
4596 
4597 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4598 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4599 		disable_igfx_iommu = 1;
4600 	} else if (!disable_igfx_iommu) {
		/* We have to ensure the gfx device is idle before we flush. */
4602 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4603 		iommu_set_dma_strict();
4604 	}
4605 }
4606 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4607 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4608 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4609 
4610 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4611 {
4612 	unsigned short ver;
4613 
4614 	if (!IS_GFX_DEVICE(dev))
4615 		return;
4616 
4617 	ver = (dev->device >> 8) & 0xff;
4618 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4619 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4620 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4621 		return;
4622 
4623 	if (risky_device(dev))
4624 		return;
4625 
4626 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4627 	iommu_skip_te_disable = 1;
4628 }
4629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4630 
/*
 * On Tylersburg chipsets, some BIOSes have been known to enable the
 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 * TLB entries, which causes it to deadlock. Check for that.  We do
 * this in a function called from init_dmars(), instead of in a PCI
 * quirk, because we don't want to print the obnoxious "BIOS broken"
 * message if VT-d is actually disabled.
 */
4638 static void __init check_tylersburg_isoch(void)
4639 {
4640 	struct pci_dev *pdev;
4641 	uint32_t vtisochctrl;
4642 
4643 	/* If there's no Azalia in the system anyway, forget it. */
4644 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4645 	if (!pdev)
4646 		return;
4647 
4648 	if (risky_device(pdev)) {
4649 		pci_dev_put(pdev);
4650 		return;
4651 	}
4652 
4653 	pci_dev_put(pdev);
4654 
	/*
	 * System Management Registers. Might be hidden, in which case
	 * we can't do the sanity check. But that's OK, because the
	 * known-broken BIOSes _don't_ actually hide it, so far.
	 */
4658 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4659 	if (!pdev)
4660 		return;
4661 
4662 	if (risky_device(pdev)) {
4663 		pci_dev_put(pdev);
4664 		return;
4665 	}
4666 
4667 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4668 		pci_dev_put(pdev);
4669 		return;
4670 	}
4671 
4672 	pci_dev_put(pdev);
4673 
4674 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4675 	if (vtisochctrl & 1)
4676 		return;
4677 
4678 	/* Drop all bits other than the number of TLB entries */
4679 	vtisochctrl &= 0x1c;
4680 
4681 	/* If we have the recommended number of TLB entries (16), fine. */
4682 	if (vtisochctrl == 0x10)
4683 		return;
4684 
	/* Zero TLB entries? That guarantees a deadlock; warn and work around it. */
4686 	if (!vtisochctrl) {
4687 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4688 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4689 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4690 		     dmi_get_system_info(DMI_BIOS_VERSION),
4691 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4692 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4693 		return;
4694 	}
4695 
4696 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4697 	       vtisochctrl);
4698 }
4699 
4700 /*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes that were
 * initiated with a translated address and used translations matching the
 * invalidation address range, violating the invalidation completion
 * ordering. Therefore, any use case that cannot guarantee DMA is stopped
 * before unmap is vulnerable to this defect. In other words, any dTLB
 * invalidation initiated outside the control of the trusted/privileged host
 * device driver must use this quirk.
4709  * Device TLBs are invalidated under the following six conditions:
4710  * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4712  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4713  *    exit_mmap() due to crash
4714  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4715  *    VM has to free pages that were unmapped
4716  * 5. Userspace driver unmaps a DMA buffer
4717  * 6. Cache invalidation in vSVA usage (upcoming)
4718  *
4719  * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the IOMMU driver is invoked through the
 * mmu_notifier and invalidates the TLB the same way as a normal user unmap,
 * which will use this quirk.
4723  *
4724  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4725  */
4726 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4727 			       unsigned long address, unsigned long mask,
4728 			       u32 pasid, u16 qdep)
4729 {
4730 	u16 sid;
4731 
4732 	if (likely(!info->dtlb_extra_inval))
4733 		return;
4734 
4735 	sid = PCI_DEVID(info->bus, info->devfn);
4736 	if (pasid == IOMMU_NO_PASID) {
4737 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4738 				   qdep, address, mask);
4739 	} else {
4740 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4741 					 pasid, qdep, address, mask);
4742 	}
4743 }
4744 
4745 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4746 
4747 /*
4748  * Function to submit a command to the enhanced command interface. The
4749  * valid enhanced command descriptions are defined in Table 47 of the
4750  * VT-d spec. The VT-d hardware implementation may support some but not
4751  * all commands, which can be determined by checking the Enhanced
4752  * Command Capability Register.
4753  *
4754  * Return values:
4755  *  - 0: Command successful without any error;
4756  *  - Negative: software error value;
4757  *  - Nonzero positive: failure status code defined in Table 48.
4758  */
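/*
 * Illustrative call pattern (a sketch, not code used elsewhere in this file;
 * "ecmd" and "oa" stand for a command code and operand A that the caller has
 * already validated against the Enhanced Command Capability registers):
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)
 *		...	handle the software error, e.g. -ENODEV or -EBUSY
 *	else if (ret > 0)
 *		...	handle the failure status code from Table 48
 */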
4759 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4760 {
4761 	unsigned long flags;
4762 	u64 res;
4763 	int ret;
4764 
4765 	if (!cap_ecmds(iommu->cap))
4766 		return -ENODEV;
4767 
4768 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4769 
4770 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4771 	if (res & DMA_ECMD_ECRSP_IP) {
4772 		ret = -EBUSY;
4773 		goto err;
4774 	}
4775 
4776 	/*
	 * Unconditionally write operand B, because:
	 * - There is no side effect if an ecmd doesn't require operand B,
	 *   but we still set the register to some value.
	 * - It's not invoked in any critical path, so the extra MMIO
	 *   write doesn't raise any performance concerns.
4782 	 */
4783 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4784 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4785 
4786 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4787 		      !(res & DMA_ECMD_ECRSP_IP), res);
4788 
4789 	if (res & DMA_ECMD_ECRSP_IP) {
4790 		ret = -ETIMEDOUT;
4791 		goto err;
4792 	}
4793 
4794 	ret = ecmd_get_status_code(res);
4795 err:
4796 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4797 
4798 	return ret;
4799 }
4800