xref: /linux/drivers/iommu/intel/iommu.c (revision 4e73826089ce899357580bbf6e0afe4e6f9900b7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/*
 * Byte counts passed to __iommu_flush_cache() for a freshly allocated
 * root table and context table respectively: one VT-d page each.
 */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. Each takes a struct pci_dev pointer
 * expression. The argument is parenthesized at every use so that compound
 * expressions (for example pointer arithmetic) expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
/* Bounds of the address range named for the IOAPIC, and the lowest IOVA byte address. */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Address width, in bits, that iommu_calculate_agaw() passes to __iommu_calculate_agaw(). */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/*
 * Untruncated 64-bit limits for a guest address width of @gaw bits:
 * the highest page frame number and the highest byte address.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

/* Convert a byte address to a page frame number using the CPU PAGE_SHIFT. */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
62 
static void __init check_tylersburg_isoch(void);
/*
 * When non-zero, iommu_flush_write_buffer() proceeds even if the
 * cap_rwbf() capability bit is clear.
 */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "intel_iommu=tboot_noforce" boot option. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "intel_iommu=off" boot option. */
static int no_platform_optin;

/* Number of struct root_entry slots that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 
76 /*
77  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
78  * if marked present.
79  */
80 static phys_addr_t root_entry_lctp(struct root_entry *re)
81 {
82 	if (!(re->lo & 1))
83 		return 0;
84 
85 	return re->lo & VTD_PAGE_MASK;
86 }
87 
88 /*
89  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
90  * if marked present.
91  */
92 static phys_addr_t root_entry_uctp(struct root_entry *re)
93 {
94 	if (!(re->hi & 1))
95 		return 0;
96 
97 	return re->hi & VTD_PAGE_MASK;
98 }
99 
/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
108 
/* One entry on the dmar_rmrr_units list (iterated by for_each_rmrr_units()). */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
117 
/* One ATSR unit; linked through @list (the file keeps a dmar_atsr_units list head). */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
125 
/* One SATC unit; linked through @list (the file keeps a dmar_satc_units list head). */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
134 
/* List heads for the ATSR, RMRR and SATC unit structures declared above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every struct dmar_rmrr_unit on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
141 
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Build-time defaults taken from Kconfig; intel_iommu_setup() overrides
 * them for the "on"/"off" and "sm_on"/"sm_off" boot options.
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by the "intel_iommu=igfx_off" boot option. */
static int dmar_map_gfx = 1;
/*
 * Cleared by the "intel_iommu=sp_off" boot option; when zero,
 * domain_update_iommu_superpage() returns 0.
 */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

/* NOTE(review): presumably flag bits for iommu_identity_mapping - confirm at the use sites. */
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
160 
161 static bool translation_pre_enabled(struct intel_iommu *iommu)
162 {
163 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
164 }
165 
166 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
167 {
168 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
169 }
170 
171 static void init_translation_status(struct intel_iommu *iommu)
172 {
173 	u32 gsts;
174 
175 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 	if (gsts & DMA_GSTS_TES)
177 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
178 }
179 
180 static int __init intel_iommu_setup(char *str)
181 {
182 	if (!str)
183 		return -EINVAL;
184 
185 	while (*str) {
186 		if (!strncmp(str, "on", 2)) {
187 			dmar_disabled = 0;
188 			pr_info("IOMMU enabled\n");
189 		} else if (!strncmp(str, "off", 3)) {
190 			dmar_disabled = 1;
191 			no_platform_optin = 1;
192 			pr_info("IOMMU disabled\n");
193 		} else if (!strncmp(str, "igfx_off", 8)) {
194 			dmar_map_gfx = 0;
195 			pr_info("Disable GFX device mapping\n");
196 		} else if (!strncmp(str, "forcedac", 8)) {
197 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
198 			iommu_dma_forcedac = true;
199 		} else if (!strncmp(str, "strict", 6)) {
200 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
201 			iommu_set_dma_strict();
202 		} else if (!strncmp(str, "sp_off", 6)) {
203 			pr_info("Disable supported super page\n");
204 			intel_iommu_superpage = 0;
205 		} else if (!strncmp(str, "sm_on", 5)) {
206 			pr_info("Enable scalable mode if hardware supports\n");
207 			intel_iommu_sm = 1;
208 		} else if (!strncmp(str, "sm_off", 6)) {
209 			pr_info("Scalable mode is disallowed\n");
210 			intel_iommu_sm = 0;
211 		} else if (!strncmp(str, "tboot_noforce", 13)) {
212 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
213 			intel_iommu_tboot_noforce = 1;
214 		} else {
215 			pr_notice("Unknown option - '%s'\n", str);
216 		}
217 
218 		str += strcspn(str, ",");
219 		while (*str == ',')
220 			str++;
221 	}
222 
223 	return 1;
224 }
225 __setup("intel_iommu=", intel_iommu_setup);
226 
227 void *alloc_pgtable_page(int node, gfp_t gfp)
228 {
229 	struct page *page;
230 	void *vaddr = NULL;
231 
232 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
233 	if (page)
234 		vaddr = page_address(page);
235 	return vaddr;
236 }
237 
/* Release a page given by its kernel virtual address @vaddr via free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
242 
243 static int domain_type_is_si(struct dmar_domain *domain)
244 {
245 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
246 }
247 
248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
249 {
250 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
251 
252 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
253 }
254 
255 /*
256  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258  * the returned SAGAW.
259  */
260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
261 {
262 	unsigned long fl_sagaw, sl_sagaw;
263 
264 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 	sl_sagaw = cap_sagaw(iommu->cap);
266 
267 	/* Second level only. */
268 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
269 		return sl_sagaw;
270 
271 	/* First level only. */
272 	if (!ecap_slts(iommu->ecap))
273 		return fl_sagaw;
274 
275 	return fl_sagaw & sl_sagaw;
276 }
277 
/*
 * Pick the largest agaw not exceeding width_to_agaw(@max_gaw) whose bit
 * is set in the IOMMU's SAGAW bitmap.
 *
 * Return: the chosen agaw, or -1 if no bit at or below the start is set.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
291 
292 /*
293  * Calculate max SAGAW for each iommu.
294  */
295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
296 {
297 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
298 }
299 
300 /*
301  * calculate agaw for each iommu.
302  * "SAGAW" may be different across iommus, use a default agaw, and
303  * get a supported less agaw for iommus that don't support the default agaw.
304  */
305 int iommu_calculate_agaw(struct intel_iommu *iommu)
306 {
307 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
308 }
309 
310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
311 {
312 	return sm_supported(iommu) ?
313 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
314 }
315 
316 static void domain_update_iommu_coherency(struct dmar_domain *domain)
317 {
318 	struct iommu_domain_info *info;
319 	struct dmar_drhd_unit *drhd;
320 	struct intel_iommu *iommu;
321 	bool found = false;
322 	unsigned long i;
323 
324 	domain->iommu_coherency = true;
325 	xa_for_each(&domain->iommu_array, i, info) {
326 		found = true;
327 		if (!iommu_paging_structure_coherency(info->iommu)) {
328 			domain->iommu_coherency = false;
329 			break;
330 		}
331 	}
332 	if (found)
333 		return;
334 
335 	/* No hardware attached; use lowest common denominator */
336 	rcu_read_lock();
337 	for_each_active_iommu(iommu, drhd) {
338 		if (!iommu_paging_structure_coherency(iommu)) {
339 			domain->iommu_coherency = false;
340 			break;
341 		}
342 	}
343 	rcu_read_unlock();
344 }
345 
346 static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 					 struct intel_iommu *skip)
348 {
349 	struct dmar_drhd_unit *drhd;
350 	struct intel_iommu *iommu;
351 	int mask = 0x3;
352 
353 	if (!intel_iommu_superpage)
354 		return 0;
355 
356 	/* set iommu_superpage to the smallest common denominator */
357 	rcu_read_lock();
358 	for_each_active_iommu(iommu, drhd) {
359 		if (iommu != skip) {
360 			if (domain && domain->use_first_level) {
361 				if (!cap_fl1gp_support(iommu->cap))
362 					mask = 0x1;
363 			} else {
364 				mask &= cap_super_page_val(iommu->cap);
365 			}
366 
367 			if (!mask)
368 				break;
369 		}
370 	}
371 	rcu_read_unlock();
372 
373 	return fls(mask);
374 }
375 
376 static int domain_update_device_node(struct dmar_domain *domain)
377 {
378 	struct device_domain_info *info;
379 	int nid = NUMA_NO_NODE;
380 	unsigned long flags;
381 
382 	spin_lock_irqsave(&domain->lock, flags);
383 	list_for_each_entry(info, &domain->devices, link) {
384 		/*
385 		 * There could possibly be multiple device numa nodes as devices
386 		 * within the same domain may sit behind different IOMMUs. There
387 		 * isn't perfect answer in such situation, so we select first
388 		 * come first served policy.
389 		 */
390 		nid = dev_to_node(info->dev);
391 		if (nid != NUMA_NO_NODE)
392 			break;
393 	}
394 	spin_unlock_irqrestore(&domain->lock, flags);
395 
396 	return nid;
397 }
398 
399 static void domain_update_iotlb(struct dmar_domain *domain);
400 
401 /* Return the super pagesize bitmap if supported. */
402 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
403 {
404 	unsigned long bitmap = 0;
405 
406 	/*
407 	 * 1-level super page supports page size of 2MiB, 2-level super page
408 	 * supports page size of both 2MiB and 1GiB.
409 	 */
410 	if (domain->iommu_superpage == 1)
411 		bitmap |= SZ_2M;
412 	else if (domain->iommu_superpage == 2)
413 		bitmap |= SZ_2M | SZ_1G;
414 
415 	return bitmap;
416 }
417 
418 /* Some capabilities may be different across iommus */
419 void domain_update_iommu_cap(struct dmar_domain *domain)
420 {
421 	domain_update_iommu_coherency(domain);
422 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
423 
424 	/*
425 	 * If RHSA is missing, we should default to the device numa domain
426 	 * as fall back.
427 	 */
428 	if (domain->nid == NUMA_NO_NODE)
429 		domain->nid = domain_update_device_node(domain);
430 
431 	/*
432 	 * First-level translation restricts the input-address to a
433 	 * canonical address (i.e., address bits 63:N have the same
434 	 * value as address bit [N-1], where N is 48-bits with 4-level
435 	 * paging and 57-bits with 5-level paging). Hence, skip bit
436 	 * [N-1].
437 	 */
438 	if (domain->use_first_level)
439 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
440 	else
441 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
442 
443 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
444 	domain_update_iotlb(domain);
445 }
446 
447 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
448 					 u8 devfn, int alloc)
449 {
450 	struct root_entry *root = &iommu->root_entry[bus];
451 	struct context_entry *context;
452 	u64 *entry;
453 
454 	/*
455 	 * Except that the caller requested to allocate a new entry,
456 	 * returning a copied context entry makes no sense.
457 	 */
458 	if (!alloc && context_copied(iommu, bus, devfn))
459 		return NULL;
460 
461 	entry = &root->lo;
462 	if (sm_supported(iommu)) {
463 		if (devfn >= 0x80) {
464 			devfn -= 0x80;
465 			entry = &root->hi;
466 		}
467 		devfn *= 2;
468 	}
469 	if (*entry & 1)
470 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
471 	else {
472 		unsigned long phy_addr;
473 		if (!alloc)
474 			return NULL;
475 
476 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
477 		if (!context)
478 			return NULL;
479 
480 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
481 		phy_addr = virt_to_phys((void *)context);
482 		*entry = phy_addr | 1;
483 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
484 	}
485 	return &context[devfn];
486 }
487 
488 /**
489  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
490  *				 sub-hierarchy of a candidate PCI-PCI bridge
491  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
492  * @bridge: the candidate PCI-PCI bridge
493  *
494  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
495  */
496 static bool
497 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
498 {
499 	struct pci_dev *pdev, *pbridge;
500 
501 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
502 		return false;
503 
504 	pdev = to_pci_dev(dev);
505 	pbridge = to_pci_dev(bridge);
506 
507 	if (pbridge->subordinate &&
508 	    pbridge->subordinate->number <= pdev->bus->number &&
509 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
510 		return true;
511 
512 	return false;
513 }
514 
515 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
516 {
517 	struct dmar_drhd_unit *drhd;
518 	u32 vtbar;
519 	int rc;
520 
521 	/* We know that this device on this chipset has its own IOMMU.
522 	 * If we find it under a different IOMMU, then the BIOS is lying
523 	 * to us. Hope that the IOMMU for this device is actually
524 	 * disabled, and it needs no translation...
525 	 */
526 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
527 	if (rc) {
528 		/* "can't" happen */
529 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
530 		return false;
531 	}
532 	vtbar &= 0xffff0000;
533 
534 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
535 	drhd = dmar_find_matched_drhd_unit(pdev);
536 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
537 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
538 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
539 		return true;
540 	}
541 
542 	return false;
543 }
544 
545 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
546 {
547 	if (!iommu || iommu->drhd->ignored)
548 		return true;
549 
550 	if (dev_is_pci(dev)) {
551 		struct pci_dev *pdev = to_pci_dev(dev);
552 
553 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
554 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
555 		    quirk_ioat_snb_local_iommu(pdev))
556 			return true;
557 	}
558 
559 	return false;
560 }
561 
/*
 * Find the IOMMU whose DRHD device scope covers @dev.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   optional out parameter for the bus number.
 * @devfn: optional out parameter for the devfn. @bus and @devfn are
 *         only written when both are non-NULL.
 *
 * Return: the matching IOMMU, or NULL when nothing matches or when
 * iommu_is_dummy() rejects the match. The scope tables are walked under
 * rcu_read_lock().
 */
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	/* Stays NULL for non-PCI devices; used below to tell the two cases apart. */
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		/* From here on @dev refers to the PF's struct device. */
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		/* Non-PCI device: match on its ACPI companion's struct device instead. */
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* For PCI devices, only consider units in the same PCI segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				/* Exact scope match: report the BDF recorded in the scope table. */
				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			/* The scope entry is a bridge and @dev sits somewhere below it. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		/*
		 * An include_all unit matches any PCI device in its segment.
		 * The got_pdev label is also the jump target of the VF and
		 * bridge cases above; all three report the device's own BDF.
		 */
		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion without a match. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
631 
632 static void domain_flush_cache(struct dmar_domain *domain,
633 			       void *addr, int size)
634 {
635 	if (!domain->iommu_coherency)
636 		clflush_cache_range(addr, size);
637 }
638 
639 static void free_context_table(struct intel_iommu *iommu)
640 {
641 	struct context_entry *context;
642 	int i;
643 
644 	if (!iommu->root_entry)
645 		return;
646 
647 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
648 		context = iommu_context_addr(iommu, i, 0, 0);
649 		if (context)
650 			free_pgtable_page(context);
651 
652 		if (!sm_supported(iommu))
653 			continue;
654 
655 		context = iommu_context_addr(iommu, i, 0x80, 0);
656 		if (context)
657 			free_pgtable_page(context);
658 	}
659 
660 	free_pgtable_page(iommu->root_entry);
661 	iommu->root_entry = NULL;
662 }
663 
664 #ifdef CONFIG_DMAR_DEBUG
665 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
666 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
667 {
668 	struct dma_pte *pte;
669 	int offset;
670 
671 	while (1) {
672 		offset = pfn_level_offset(pfn, level);
673 		pte = &parent[offset];
674 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
675 			pr_info("PTE not present at level %d\n", level);
676 			break;
677 		}
678 
679 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
680 
681 		if (level == 1)
682 			break;
683 
684 		parent = phys_to_virt(dma_pte_addr(pte));
685 		level--;
686 	}
687 }
688 
689 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
690 			  unsigned long long addr, u32 pasid)
691 {
692 	struct pasid_dir_entry *dir, *pde;
693 	struct pasid_entry *entries, *pte;
694 	struct context_entry *ctx_entry;
695 	struct root_entry *rt_entry;
696 	int i, dir_index, index, level;
697 	u8 devfn = source_id & 0xff;
698 	u8 bus = source_id >> 8;
699 	struct dma_pte *pgtable;
700 
701 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
702 
703 	/* root entry dump */
704 	rt_entry = &iommu->root_entry[bus];
705 	if (!rt_entry) {
706 		pr_info("root table entry is not present\n");
707 		return;
708 	}
709 
710 	if (sm_supported(iommu))
711 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
712 			rt_entry->hi, rt_entry->lo);
713 	else
714 		pr_info("root entry: 0x%016llx", rt_entry->lo);
715 
716 	/* context entry dump */
717 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
718 	if (!ctx_entry) {
719 		pr_info("context table entry is not present\n");
720 		return;
721 	}
722 
723 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
724 		ctx_entry->hi, ctx_entry->lo);
725 
726 	/* legacy mode does not require PASID entries */
727 	if (!sm_supported(iommu)) {
728 		level = agaw_to_level(ctx_entry->hi & 7);
729 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
730 		goto pgtable_walk;
731 	}
732 
733 	/* get the pointer to pasid directory entry */
734 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
735 	if (!dir) {
736 		pr_info("pasid directory entry is not present\n");
737 		return;
738 	}
739 	/* For request-without-pasid, get the pasid from context entry */
740 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
741 		pasid = IOMMU_NO_PASID;
742 
743 	dir_index = pasid >> PASID_PDE_SHIFT;
744 	pde = &dir[dir_index];
745 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
746 
747 	/* get the pointer to the pasid table entry */
748 	entries = get_pasid_table_from_pde(pde);
749 	if (!entries) {
750 		pr_info("pasid table entry is not present\n");
751 		return;
752 	}
753 	index = pasid & PASID_PTE_MASK;
754 	pte = &entries[index];
755 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
756 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
757 
758 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
759 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
760 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
761 	} else {
762 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
763 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
764 	}
765 
766 pgtable_walk:
767 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
768 }
769 #endif
770 
/*
 * Walk @domain's page table from the top level down to the PTE that maps
 * @pfn.
 *
 * @target_level: in/out. A non-zero value is the level to stop at; any
 *                missing intermediate tables on the way are allocated
 *                with @gfp. Zero means "look up only": the walk stops at
 *                the first superpage or non-present PTE without
 *                allocating, and the level reached is written back.
 *
 * Return: pointer to the PTE, or NULL if @pfn is beyond what the domain
 * supports or a table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop at the first leaf or hole. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before publishing it in the parent PTE. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install only if the slot is still empty; a non-zero old value means we lost a race. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		/* Descend into the table the (possibly just installed) PTE points to. */
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	/* Report where a lookup-only walk ended. */
	if (!*target_level)
		*target_level = level;

	return pte;
}
826 
827 /* return address's pte at specific level */
828 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
829 					 unsigned long pfn,
830 					 int level, int *large_page)
831 {
832 	struct dma_pte *parent, *pte;
833 	int total = agaw_to_level(domain->agaw);
834 	int offset;
835 
836 	parent = domain->pgd;
837 	while (level <= total) {
838 		offset = pfn_level_offset(pfn, total);
839 		pte = &parent[offset];
840 		if (level == total)
841 			return pte;
842 
843 		if (!dma_pte_present(pte)) {
844 			*large_page = total;
845 			break;
846 		}
847 
848 		if (dma_pte_superpage(pte)) {
849 			*large_page = total;
850 			return pte;
851 		}
852 
853 		parent = phys_to_virt(dma_pte_addr(pte));
854 		total--;
855 	}
856 	return NULL;
857 }
858 
/*
 * Clear the leaf PTEs covering [@start_pfn, @last_pfn] in @domain.
 * A TLB flush should follow; this function does not issue one.
 *
 * Warns and returns without changes if @last_pfn is not supported by the
 * domain or if @start_pfn > @last_pfn.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	do {
		/* Default to level 1; dma_pfn_level_pte() overwrites it at a hole or superpage. */
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Nothing mapped here: skip ahead to the next boundary one level up. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear consecutive PTEs until the range ends or the table page ends. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* One cache flush for the whole run of PTEs just cleared. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	/* NOTE(review): the start_pfn != 0 test presumably guards against wrap-around - confirm. */
	} while (start_pfn && start_pfn <= last_pfn);
}
890 
/*
 * Recursively free page-table pages under @pte (a table at @level) that
 * are entirely covered by [@start_pfn, @last_pfn].
 *
 * @retain_level: tables are only freed when referenced from a PTE whose
 *                level is below this value.
 * @pfn:          first pfn covered by the table @pte points into.
 *
 * Non-present and superpage entries are skipped.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at the first entry of this table that intersects the range. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		/* First pfn covered by this entry, and the child table it points to. */
		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		/* Children of a level-2 entry are leaf tables with nothing below to recurse into. */
		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
929 
930 /*
931  * clear last level (leaf) ptes and free page table pages below the
932  * level we wish to keep intact.
933  */
934 static void dma_pte_free_pagetable(struct dmar_domain *domain,
935 				   unsigned long start_pfn,
936 				   unsigned long last_pfn,
937 				   int retain_level)
938 {
939 	dma_pte_clear_range(domain, start_pfn, last_pfn);
940 
941 	/* We don't need lock here; nobody else touches the iova range */
942 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
943 			   domain->pgd, 0, start_pfn, last_pfn);
944 
945 	/* free pgd */
946 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
947 		free_pgtable_page(domain->pgd);
948 		domain->pgd = NULL;
949 	}
950 }
951 
952 /* When a page at a given level is being unlinked from its parent, we don't
953    need to *modify* it at all. All we need to do is make a list of all the
954    pages which can be freed just as soon as we've flushed the IOTLB and we
955    know the hardware page-walk will no longer touch them.
956    The 'pte' argument is the *parent* PTE, pointing to the page that is to
957    be freed. */
958 static void dma_pte_list_pagetables(struct dmar_domain *domain,
959 				    int level, struct dma_pte *pte,
960 				    struct list_head *freelist)
961 {
962 	struct page *pg;
963 
964 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
965 	list_add_tail(&pg->lru, freelist);
966 
967 	if (level == 1)
968 		return;
969 
970 	pte = page_address(pg);
971 	do {
972 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
973 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
974 		pte++;
975 	} while (!first_pte_in_page(pte));
976 }
977 
/*
 * Clear the PTEs of the table @pte (at @level) that fall inside
 * [@start_pfn, @last_pfn], recursing into lower levels where an entry is
 * only partly covered.
 *
 * @pfn:      first pfn covered by the table @pte points into.
 * @freelist: receives the table pages that became unreachable; they are
 *            not freed here.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	/* Bounds of the run of PTEs cleared at this level, for a single flush at the end. */
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush from the first cleared PTE through the last one, inclusive. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1021 
1022 /* We can't just free the pages because the IOMMU may still be walking
1023    the page tables, and may have cached the intermediate levels. The
1024    pages can only be freed after the IOTLB flush has been done. */
1025 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1026 			 unsigned long last_pfn, struct list_head *freelist)
1027 {
1028 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1029 	    WARN_ON(start_pfn > last_pfn))
1030 		return;
1031 
1032 	/* we don't need lock here; nobody else touches the iova range */
1033 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1034 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1035 
1036 	/* free pgd */
1037 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1038 		struct page *pgd_page = virt_to_page(domain->pgd);
1039 		list_add_tail(&pgd_page->lru, freelist);
1040 		domain->pgd = NULL;
1041 	}
1042 }
1043 
1044 /* iommu handling */
1045 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1046 {
1047 	struct root_entry *root;
1048 
1049 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1050 	if (!root) {
1051 		pr_err("Allocating root entry for %s failed\n",
1052 			iommu->name);
1053 		return -ENOMEM;
1054 	}
1055 
1056 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1057 	iommu->root_entry = root;
1058 
1059 	return 0;
1060 }
1061 
1062 static void iommu_set_root_entry(struct intel_iommu *iommu)
1063 {
1064 	u64 addr;
1065 	u32 sts;
1066 	unsigned long flag;
1067 
1068 	addr = virt_to_phys(iommu->root_entry);
1069 	if (sm_supported(iommu))
1070 		addr |= DMA_RTADDR_SMT;
1071 
1072 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1074 
1075 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1076 
1077 	/* Make sure hardware complete it */
1078 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1079 		      readl, (sts & DMA_GSTS_RTPS), sts);
1080 
1081 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1082 
1083 	/*
1084 	 * Hardware invalidates all DMA remapping hardware translation
1085 	 * caches as part of SRTP flow.
1086 	 */
1087 	if (cap_esrtps(iommu->cap))
1088 		return;
1089 
1090 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1091 	if (sm_supported(iommu))
1092 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1093 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1094 }
1095 
1096 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1097 {
1098 	u32 val;
1099 	unsigned long flag;
1100 
1101 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1102 		return;
1103 
1104 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1105 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1106 
1107 	/* Make sure hardware complete it */
1108 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1109 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1110 
1111 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1112 }
1113 
1114 /* return value determine if we need a write buffer flush */
1115 static void __iommu_flush_context(struct intel_iommu *iommu,
1116 				  u16 did, u16 source_id, u8 function_mask,
1117 				  u64 type)
1118 {
1119 	u64 val = 0;
1120 	unsigned long flag;
1121 
1122 	switch (type) {
1123 	case DMA_CCMD_GLOBAL_INVL:
1124 		val = DMA_CCMD_GLOBAL_INVL;
1125 		break;
1126 	case DMA_CCMD_DOMAIN_INVL:
1127 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1128 		break;
1129 	case DMA_CCMD_DEVICE_INVL:
1130 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1131 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1132 		break;
1133 	default:
1134 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1135 			iommu->name, type);
1136 		return;
1137 	}
1138 	val |= DMA_CCMD_ICC;
1139 
1140 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1141 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1142 
1143 	/* Make sure hardware complete it */
1144 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1145 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1146 
1147 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1148 }
1149 
1150 /* return value determine if we need a write buffer flush */
1151 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1152 				u64 addr, unsigned int size_order, u64 type)
1153 {
1154 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1155 	u64 val = 0, val_iva = 0;
1156 	unsigned long flag;
1157 
1158 	switch (type) {
1159 	case DMA_TLB_GLOBAL_FLUSH:
1160 		/* global flush doesn't need set IVA_REG */
1161 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1162 		break;
1163 	case DMA_TLB_DSI_FLUSH:
1164 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1165 		break;
1166 	case DMA_TLB_PSI_FLUSH:
1167 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1168 		/* IH bit is passed in as part of address */
1169 		val_iva = size_order | addr;
1170 		break;
1171 	default:
1172 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1173 			iommu->name, type);
1174 		return;
1175 	}
1176 
1177 	if (cap_write_drain(iommu->cap))
1178 		val |= DMA_TLB_WRITE_DRAIN;
1179 
1180 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1181 	/* Note: Only uses first TLB reg currently */
1182 	if (val_iva)
1183 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1184 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1185 
1186 	/* Make sure hardware complete it */
1187 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1188 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1189 
1190 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1191 
1192 	/* check IOTLB invalidation granularity */
1193 	if (DMA_TLB_IAIG(val) == 0)
1194 		pr_err("Flush IOTLB failed\n");
1195 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1196 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1197 			(unsigned long long)DMA_TLB_IIRG(type),
1198 			(unsigned long long)DMA_TLB_IAIG(val));
1199 }
1200 
1201 static struct device_domain_info *
1202 domain_lookup_dev_info(struct dmar_domain *domain,
1203 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1204 {
1205 	struct device_domain_info *info;
1206 	unsigned long flags;
1207 
1208 	spin_lock_irqsave(&domain->lock, flags);
1209 	list_for_each_entry(info, &domain->devices, link) {
1210 		if (info->iommu == iommu && info->bus == bus &&
1211 		    info->devfn == devfn) {
1212 			spin_unlock_irqrestore(&domain->lock, flags);
1213 			return info;
1214 		}
1215 	}
1216 	spin_unlock_irqrestore(&domain->lock, flags);
1217 
1218 	return NULL;
1219 }
1220 
1221 static void domain_update_iotlb(struct dmar_domain *domain)
1222 {
1223 	struct dev_pasid_info *dev_pasid;
1224 	struct device_domain_info *info;
1225 	bool has_iotlb_device = false;
1226 	unsigned long flags;
1227 
1228 	spin_lock_irqsave(&domain->lock, flags);
1229 	list_for_each_entry(info, &domain->devices, link) {
1230 		if (info->ats_enabled) {
1231 			has_iotlb_device = true;
1232 			break;
1233 		}
1234 	}
1235 
1236 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1237 		info = dev_iommu_priv_get(dev_pasid->dev);
1238 		if (info->ats_enabled) {
1239 			has_iotlb_device = true;
1240 			break;
1241 		}
1242 	}
1243 	domain->has_iotlb_device = has_iotlb_device;
1244 	spin_unlock_irqrestore(&domain->lock, flags);
1245 }
1246 
1247 /*
1248  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1249  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1250  * check because it applies only to the built-in QAT devices and it doesn't
1251  * grant additional privileges.
1252  */
1253 #define BUGGY_QAT_DEVID_MASK 0x4940
1254 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1255 {
1256 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1257 		return false;
1258 
1259 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1260 		return false;
1261 
1262 	return true;
1263 }
1264 
1265 static void iommu_enable_pci_caps(struct device_domain_info *info)
1266 {
1267 	struct pci_dev *pdev;
1268 
1269 	if (!dev_is_pci(info->dev))
1270 		return;
1271 
1272 	pdev = to_pci_dev(info->dev);
1273 
1274 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1275 	   the device if you enable PASID support after ATS support is
1276 	   undefined. So always enable PASID support on devices which
1277 	   have it, even if we can't yet know if we're ever going to
1278 	   use it. */
1279 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1280 		info->pasid_enabled = 1;
1281 
1282 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1283 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1284 		info->ats_enabled = 1;
1285 		domain_update_iotlb(info->domain);
1286 	}
1287 }
1288 
1289 static void iommu_disable_pci_caps(struct device_domain_info *info)
1290 {
1291 	struct pci_dev *pdev;
1292 
1293 	if (!dev_is_pci(info->dev))
1294 		return;
1295 
1296 	pdev = to_pci_dev(info->dev);
1297 
1298 	if (info->ats_enabled) {
1299 		pci_disable_ats(pdev);
1300 		info->ats_enabled = 0;
1301 		domain_update_iotlb(info->domain);
1302 	}
1303 
1304 	if (info->pasid_enabled) {
1305 		pci_disable_pasid(pdev);
1306 		info->pasid_enabled = 0;
1307 	}
1308 }
1309 
1310 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1311 				    u64 addr, unsigned int mask)
1312 {
1313 	u16 sid, qdep;
1314 
1315 	if (!info || !info->ats_enabled)
1316 		return;
1317 
1318 	sid = info->bus << 8 | info->devfn;
1319 	qdep = info->ats_qdep;
1320 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1321 			   qdep, addr, mask);
1322 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1323 }
1324 
1325 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1326 				  u64 addr, unsigned mask)
1327 {
1328 	struct dev_pasid_info *dev_pasid;
1329 	struct device_domain_info *info;
1330 	unsigned long flags;
1331 
1332 	if (!domain->has_iotlb_device)
1333 		return;
1334 
1335 	spin_lock_irqsave(&domain->lock, flags);
1336 	list_for_each_entry(info, &domain->devices, link)
1337 		__iommu_flush_dev_iotlb(info, addr, mask);
1338 
1339 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1340 		info = dev_iommu_priv_get(dev_pasid->dev);
1341 
1342 		if (!info->ats_enabled)
1343 			continue;
1344 
1345 		qi_flush_dev_iotlb_pasid(info->iommu,
1346 					 PCI_DEVID(info->bus, info->devfn),
1347 					 info->pfsid, dev_pasid->pasid,
1348 					 info->ats_qdep, addr,
1349 					 mask);
1350 	}
1351 	spin_unlock_irqrestore(&domain->lock, flags);
1352 }
1353 
1354 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1355 				     struct dmar_domain *domain, u64 addr,
1356 				     unsigned long npages, bool ih)
1357 {
1358 	u16 did = domain_id_iommu(domain, iommu);
1359 	struct dev_pasid_info *dev_pasid;
1360 	unsigned long flags;
1361 
1362 	spin_lock_irqsave(&domain->lock, flags);
1363 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1364 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1365 
1366 	if (!list_empty(&domain->devices))
1367 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1368 	spin_unlock_irqrestore(&domain->lock, flags);
1369 }
1370 
1371 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1372 				  struct dmar_domain *domain,
1373 				  unsigned long pfn, unsigned int pages,
1374 				  int ih, int map)
1375 {
1376 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1377 	unsigned int mask = ilog2(aligned_pages);
1378 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1379 	u16 did = domain_id_iommu(domain, iommu);
1380 
1381 	if (WARN_ON(!pages))
1382 		return;
1383 
1384 	if (ih)
1385 		ih = 1 << 6;
1386 
1387 	if (domain->use_first_level) {
1388 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1389 	} else {
1390 		unsigned long bitmask = aligned_pages - 1;
1391 
1392 		/*
1393 		 * PSI masks the low order bits of the base address. If the
1394 		 * address isn't aligned to the mask, then compute a mask value
1395 		 * needed to ensure the target range is flushed.
1396 		 */
1397 		if (unlikely(bitmask & pfn)) {
1398 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1399 
1400 			/*
1401 			 * Since end_pfn <= pfn + bitmask, the only way bits
1402 			 * higher than bitmask can differ in pfn and end_pfn is
1403 			 * by carrying. This means after masking out bitmask,
1404 			 * high bits starting with the first set bit in
1405 			 * shared_bits are all equal in both pfn and end_pfn.
1406 			 */
1407 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1408 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1409 		}
1410 
1411 		/*
1412 		 * Fallback to domain selective flush if no PSI support or
1413 		 * the size is too big.
1414 		 */
1415 		if (!cap_pgsel_inv(iommu->cap) ||
1416 		    mask > cap_max_amask_val(iommu->cap))
1417 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1418 							DMA_TLB_DSI_FLUSH);
1419 		else
1420 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1421 							DMA_TLB_PSI_FLUSH);
1422 	}
1423 
1424 	/*
1425 	 * In caching mode, changes of pages from non-present to present require
1426 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1427 	 */
1428 	if (!cap_caching_mode(iommu->cap) || !map)
1429 		iommu_flush_dev_iotlb(domain, addr, mask);
1430 }
1431 
1432 /* Notification for newly created mappings */
1433 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1434 				 unsigned long pfn, unsigned int pages)
1435 {
1436 	/*
1437 	 * It's a non-present to present mapping. Only flush if caching mode
1438 	 * and second level.
1439 	 */
1440 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1441 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1442 	else
1443 		iommu_flush_write_buffer(iommu);
1444 }
1445 
1446 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1447 {
1448 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1449 	struct iommu_domain_info *info;
1450 	unsigned long idx;
1451 
1452 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1453 		struct intel_iommu *iommu = info->iommu;
1454 		u16 did = domain_id_iommu(dmar_domain, iommu);
1455 
1456 		if (dmar_domain->use_first_level)
1457 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1458 		else
1459 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1460 						 DMA_TLB_DSI_FLUSH);
1461 
1462 		if (!cap_caching_mode(iommu->cap))
1463 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1464 	}
1465 }
1466 
1467 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1468 {
1469 	u32 pmen;
1470 	unsigned long flags;
1471 
1472 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1473 		return;
1474 
1475 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1476 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1477 	pmen &= ~DMA_PMEN_EPM;
1478 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1479 
1480 	/* wait for the protected region status bit to clear */
1481 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1482 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1483 
1484 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1485 }
1486 
1487 static void iommu_enable_translation(struct intel_iommu *iommu)
1488 {
1489 	u32 sts;
1490 	unsigned long flags;
1491 
1492 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1493 	iommu->gcmd |= DMA_GCMD_TE;
1494 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1495 
1496 	/* Make sure hardware complete it */
1497 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1498 		      readl, (sts & DMA_GSTS_TES), sts);
1499 
1500 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1501 }
1502 
1503 static void iommu_disable_translation(struct intel_iommu *iommu)
1504 {
1505 	u32 sts;
1506 	unsigned long flag;
1507 
1508 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1509 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1510 		return;
1511 
1512 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1513 	iommu->gcmd &= ~DMA_GCMD_TE;
1514 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1515 
1516 	/* Make sure hardware complete it */
1517 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1518 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1519 
1520 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1521 }
1522 
1523 static int iommu_init_domains(struct intel_iommu *iommu)
1524 {
1525 	u32 ndomains;
1526 
1527 	ndomains = cap_ndoms(iommu->cap);
1528 	pr_debug("%s: Number of Domains supported <%d>\n",
1529 		 iommu->name, ndomains);
1530 
1531 	spin_lock_init(&iommu->lock);
1532 
1533 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1534 	if (!iommu->domain_ids)
1535 		return -ENOMEM;
1536 
1537 	/*
1538 	 * If Caching mode is set, then invalid translations are tagged
1539 	 * with domain-id 0, hence we need to pre-allocate it. We also
1540 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1541 	 * make sure it is not used for a real domain.
1542 	 */
1543 	set_bit(0, iommu->domain_ids);
1544 
1545 	/*
1546 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1547 	 * entry for first-level or pass-through translation modes should
1548 	 * be programmed with a domain id different from those used for
1549 	 * second-level or nested translation. We reserve a domain id for
1550 	 * this purpose.
1551 	 */
1552 	if (sm_supported(iommu))
1553 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1554 
1555 	return 0;
1556 }
1557 
1558 static void disable_dmar_iommu(struct intel_iommu *iommu)
1559 {
1560 	if (!iommu->domain_ids)
1561 		return;
1562 
1563 	/*
1564 	 * All iommu domains must have been detached from the devices,
1565 	 * hence there should be no domain IDs in use.
1566 	 */
1567 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1568 		    > NUM_RESERVED_DID))
1569 		return;
1570 
1571 	if (iommu->gcmd & DMA_GCMD_TE)
1572 		iommu_disable_translation(iommu);
1573 }
1574 
1575 static void free_dmar_iommu(struct intel_iommu *iommu)
1576 {
1577 	if (iommu->domain_ids) {
1578 		bitmap_free(iommu->domain_ids);
1579 		iommu->domain_ids = NULL;
1580 	}
1581 
1582 	if (iommu->copied_tables) {
1583 		bitmap_free(iommu->copied_tables);
1584 		iommu->copied_tables = NULL;
1585 	}
1586 
1587 	/* free context mapping */
1588 	free_context_table(iommu);
1589 
1590 #ifdef CONFIG_INTEL_IOMMU_SVM
1591 	if (pasid_supported(iommu)) {
1592 		if (ecap_prs(iommu->ecap))
1593 			intel_svm_finish_prq(iommu);
1594 	}
1595 #endif
1596 }
1597 
1598 /*
1599  * Check and return whether first level is used by default for
1600  * DMA translation.
1601  */
1602 static bool first_level_by_default(unsigned int type)
1603 {
1604 	/* Only SL is available in legacy mode */
1605 	if (!scalable_mode_support())
1606 		return false;
1607 
1608 	/* Only level (either FL or SL) is available, just use it */
1609 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1610 		return intel_cap_flts_sanity();
1611 
1612 	/* Both levels are available, decide it based on domain type */
1613 	return type != IOMMU_DOMAIN_UNMANAGED;
1614 }
1615 
1616 static struct dmar_domain *alloc_domain(unsigned int type)
1617 {
1618 	struct dmar_domain *domain;
1619 
1620 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1621 	if (!domain)
1622 		return NULL;
1623 
1624 	domain->nid = NUMA_NO_NODE;
1625 	if (first_level_by_default(type))
1626 		domain->use_first_level = true;
1627 	domain->has_iotlb_device = false;
1628 	INIT_LIST_HEAD(&domain->devices);
1629 	INIT_LIST_HEAD(&domain->dev_pasids);
1630 	spin_lock_init(&domain->lock);
1631 	xa_init(&domain->iommu_array);
1632 
1633 	return domain;
1634 }
1635 
1636 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1637 {
1638 	struct iommu_domain_info *info, *curr;
1639 	unsigned long ndomains;
1640 	int num, ret = -ENOSPC;
1641 
1642 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1643 	if (!info)
1644 		return -ENOMEM;
1645 
1646 	spin_lock(&iommu->lock);
1647 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1648 	if (curr) {
1649 		curr->refcnt++;
1650 		spin_unlock(&iommu->lock);
1651 		kfree(info);
1652 		return 0;
1653 	}
1654 
1655 	ndomains = cap_ndoms(iommu->cap);
1656 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1657 	if (num >= ndomains) {
1658 		pr_err("%s: No free domain ids\n", iommu->name);
1659 		goto err_unlock;
1660 	}
1661 
1662 	set_bit(num, iommu->domain_ids);
1663 	info->refcnt	= 1;
1664 	info->did	= num;
1665 	info->iommu	= iommu;
1666 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1667 			  NULL, info, GFP_ATOMIC);
1668 	if (curr) {
1669 		ret = xa_err(curr) ? : -EBUSY;
1670 		goto err_clear;
1671 	}
1672 	domain_update_iommu_cap(domain);
1673 
1674 	spin_unlock(&iommu->lock);
1675 	return 0;
1676 
1677 err_clear:
1678 	clear_bit(info->did, iommu->domain_ids);
1679 err_unlock:
1680 	spin_unlock(&iommu->lock);
1681 	kfree(info);
1682 	return ret;
1683 }
1684 
1685 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1686 {
1687 	struct iommu_domain_info *info;
1688 
1689 	spin_lock(&iommu->lock);
1690 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1691 	if (--info->refcnt == 0) {
1692 		clear_bit(info->did, iommu->domain_ids);
1693 		xa_erase(&domain->iommu_array, iommu->seq_id);
1694 		domain->nid = NUMA_NO_NODE;
1695 		domain_update_iommu_cap(domain);
1696 		kfree(info);
1697 	}
1698 	spin_unlock(&iommu->lock);
1699 }
1700 
/*
 * Round a guest address width up to the next value of the form 12 + 9 * k
 * and cap the result at 64.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1714 
1715 static void domain_exit(struct dmar_domain *domain)
1716 {
1717 	if (domain->pgd) {
1718 		LIST_HEAD(freelist);
1719 
1720 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1721 		put_pages_list(&freelist);
1722 	}
1723 
1724 	if (WARN_ON(!list_empty(&domain->devices)))
1725 		return;
1726 
1727 	kfree(domain);
1728 }
1729 
1730 /*
1731  * Get the PASID directory size for scalable mode context entry.
1732  * Value of X in the PDTS field of a scalable mode context entry
1733  * indicates PASID directory with 2^(X + 7) entries.
1734  */
1735 static unsigned long context_get_sm_pds(struct pasid_table *table)
1736 {
1737 	unsigned long pds, max_pde;
1738 
1739 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1740 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1741 	if (pds < 7)
1742 		return 0;
1743 
1744 	return pds - 7;
1745 }
1746 
1747 static int domain_context_mapping_one(struct dmar_domain *domain,
1748 				      struct intel_iommu *iommu,
1749 				      struct pasid_table *table,
1750 				      u8 bus, u8 devfn)
1751 {
1752 	struct device_domain_info *info =
1753 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1754 	u16 did = domain_id_iommu(domain, iommu);
1755 	int translation = CONTEXT_TT_MULTI_LEVEL;
1756 	struct context_entry *context;
1757 	int ret;
1758 
1759 	if (hw_pass_through && domain_type_is_si(domain))
1760 		translation = CONTEXT_TT_PASS_THROUGH;
1761 
1762 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1763 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1764 
1765 	spin_lock(&iommu->lock);
1766 	ret = -ENOMEM;
1767 	context = iommu_context_addr(iommu, bus, devfn, 1);
1768 	if (!context)
1769 		goto out_unlock;
1770 
1771 	ret = 0;
1772 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1773 		goto out_unlock;
1774 
1775 	/*
1776 	 * For kdump cases, old valid entries may be cached due to the
1777 	 * in-flight DMA and copied pgtable, but there is no unmapping
1778 	 * behaviour for them, thus we need an explicit cache flush for
1779 	 * the newly-mapped device. For kdump, at this point, the device
1780 	 * is supposed to finish reset at its driver probe stage, so no
1781 	 * in-flight DMA will exist, and we don't need to worry anymore
1782 	 * hereafter.
1783 	 */
1784 	if (context_copied(iommu, bus, devfn)) {
1785 		u16 did_old = context_domain_id(context);
1786 
1787 		if (did_old < cap_ndoms(iommu->cap)) {
1788 			iommu->flush.flush_context(iommu, did_old,
1789 						   (((u16)bus) << 8) | devfn,
1790 						   DMA_CCMD_MASK_NOBIT,
1791 						   DMA_CCMD_DEVICE_INVL);
1792 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1793 						 DMA_TLB_DSI_FLUSH);
1794 		}
1795 
1796 		clear_context_copied(iommu, bus, devfn);
1797 	}
1798 
1799 	context_clear_entry(context);
1800 
1801 	if (sm_supported(iommu)) {
1802 		unsigned long pds;
1803 
1804 		/* Setup the PASID DIR pointer: */
1805 		pds = context_get_sm_pds(table);
1806 		context->lo = (u64)virt_to_phys(table->table) |
1807 				context_pdts(pds);
1808 
1809 		/* Setup the RID_PASID field: */
1810 		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1811 
1812 		/*
1813 		 * Setup the Device-TLB enable bit and Page request
1814 		 * Enable bit:
1815 		 */
1816 		if (info && info->ats_supported)
1817 			context_set_sm_dte(context);
1818 		if (info && info->pri_supported)
1819 			context_set_sm_pre(context);
1820 		if (info && info->pasid_supported)
1821 			context_set_pasid(context);
1822 	} else {
1823 		struct dma_pte *pgd = domain->pgd;
1824 		int agaw;
1825 
1826 		context_set_domain_id(context, did);
1827 
1828 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1829 			/*
1830 			 * Skip top levels of page tables for iommu which has
1831 			 * less agaw than default. Unnecessary for PT mode.
1832 			 */
1833 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1834 				ret = -ENOMEM;
1835 				pgd = phys_to_virt(dma_pte_addr(pgd));
1836 				if (!dma_pte_present(pgd))
1837 					goto out_unlock;
1838 			}
1839 
1840 			if (info && info->ats_supported)
1841 				translation = CONTEXT_TT_DEV_IOTLB;
1842 			else
1843 				translation = CONTEXT_TT_MULTI_LEVEL;
1844 
1845 			context_set_address_root(context, virt_to_phys(pgd));
1846 			context_set_address_width(context, agaw);
1847 		} else {
1848 			/*
1849 			 * In pass through mode, AW must be programmed to
1850 			 * indicate the largest AGAW value supported by
1851 			 * hardware. And ASR is ignored by hardware.
1852 			 */
1853 			context_set_address_width(context, iommu->msagaw);
1854 		}
1855 
1856 		context_set_translation_type(context, translation);
1857 	}
1858 
1859 	context_set_fault_enable(context);
1860 	context_set_present(context);
1861 	if (!ecap_coherent(iommu->ecap))
1862 		clflush_cache_range(context, sizeof(*context));
1863 
1864 	/*
1865 	 * It's a non-present to present mapping. If hardware doesn't cache
1866 	 * non-present entry we only need to flush the write-buffer. If the
1867 	 * _does_ cache non-present entries, then it does so in the special
1868 	 * domain #0, which we have to flush:
1869 	 */
1870 	if (cap_caching_mode(iommu->cap)) {
1871 		iommu->flush.flush_context(iommu, 0,
1872 					   (((u16)bus) << 8) | devfn,
1873 					   DMA_CCMD_MASK_NOBIT,
1874 					   DMA_CCMD_DEVICE_INVL);
1875 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1876 	} else {
1877 		iommu_flush_write_buffer(iommu);
1878 	}
1879 
1880 	ret = 0;
1881 
1882 out_unlock:
1883 	spin_unlock(&iommu->lock);
1884 
1885 	return ret;
1886 }
1887 
1888 struct domain_context_mapping_data {
1889 	struct dmar_domain *domain;
1890 	struct intel_iommu *iommu;
1891 	struct pasid_table *table;
1892 };
1893 
1894 static int domain_context_mapping_cb(struct pci_dev *pdev,
1895 				     u16 alias, void *opaque)
1896 {
1897 	struct domain_context_mapping_data *data = opaque;
1898 
1899 	return domain_context_mapping_one(data->domain, data->iommu,
1900 					  data->table, PCI_BUS_NUM(alias),
1901 					  alias & 0xff);
1902 }
1903 
1904 static int
1905 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1906 {
1907 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1908 	struct domain_context_mapping_data data;
1909 	struct intel_iommu *iommu = info->iommu;
1910 	u8 bus = info->bus, devfn = info->devfn;
1911 	struct pasid_table *table;
1912 
1913 	table = intel_pasid_get_table(dev);
1914 
1915 	if (!dev_is_pci(dev))
1916 		return domain_context_mapping_one(domain, iommu, table,
1917 						  bus, devfn);
1918 
1919 	data.domain = domain;
1920 	data.iommu = iommu;
1921 	data.table = table;
1922 
1923 	return pci_for_each_dma_alias(to_pci_dev(dev),
1924 				      &domain_context_mapping_cb, &data);
1925 }
1926 
1927 /* Returns a number of VTD pages, but aligned to MM page size */
1928 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1929 {
1930 	host_addr &= ~PAGE_MASK;
1931 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1932 }
1933 
1934 /* Return largest possible superpage level for a given mapping */
1935 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1936 				   unsigned long phy_pfn, unsigned long pages)
1937 {
1938 	int support, level = 1;
1939 	unsigned long pfnmerge;
1940 
1941 	support = domain->iommu_superpage;
1942 
1943 	/* To use a large page, the virtual *and* physical addresses
1944 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1945 	   of them will mean we have to use smaller pages. So just
1946 	   merge them and check both at once. */
1947 	pfnmerge = iov_pfn | phy_pfn;
1948 
1949 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1950 		pages >>= VTD_STRIDE_SHIFT;
1951 		if (!pages)
1952 			break;
1953 		pfnmerge >>= VTD_STRIDE_SHIFT;
1954 		level++;
1955 		support--;
1956 	}
1957 	return level;
1958 }
1959 
1960 /*
1961  * Ensure that old small page tables are removed to make room for superpage(s).
1962  * We're going to add new large pages, so make sure we don't remove their parent
1963  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1964  */
1965 static void switch_to_super_page(struct dmar_domain *domain,
1966 				 unsigned long start_pfn,
1967 				 unsigned long end_pfn, int level)
1968 {
1969 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1970 	struct iommu_domain_info *info;
1971 	struct dma_pte *pte = NULL;
1972 	unsigned long i;
1973 
1974 	while (start_pfn <= end_pfn) {
1975 		if (!pte)
1976 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1977 					     GFP_ATOMIC);
1978 
1979 		if (dma_pte_present(pte)) {
1980 			dma_pte_free_pagetable(domain, start_pfn,
1981 					       start_pfn + lvl_pages - 1,
1982 					       level + 1);
1983 
1984 			xa_for_each(&domain->iommu_array, i, info)
1985 				iommu_flush_iotlb_psi(info->iommu, domain,
1986 						      start_pfn, lvl_pages,
1987 						      0, 0);
1988 		}
1989 
1990 		pte++;
1991 		start_pfn += lvl_pages;
1992 		if (first_pte_in_page(pte))
1993 			pte = NULL;
1994 	}
1995 }
1996 
1997 static int
1998 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1999 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2000 		 gfp_t gfp)
2001 {
2002 	struct dma_pte *first_pte = NULL, *pte = NULL;
2003 	unsigned int largepage_lvl = 0;
2004 	unsigned long lvl_pages = 0;
2005 	phys_addr_t pteval;
2006 	u64 attr;
2007 
2008 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2009 		return -EINVAL;
2010 
2011 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2012 		return -EINVAL;
2013 
2014 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2015 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2016 		return -EINVAL;
2017 	}
2018 
2019 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2020 	attr |= DMA_FL_PTE_PRESENT;
2021 	if (domain->use_first_level) {
2022 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2023 		if (prot & DMA_PTE_WRITE)
2024 			attr |= DMA_FL_PTE_DIRTY;
2025 	}
2026 
2027 	domain->has_mappings = true;
2028 
2029 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2030 
2031 	while (nr_pages > 0) {
2032 		uint64_t tmp;
2033 
2034 		if (!pte) {
2035 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2036 					phys_pfn, nr_pages);
2037 
2038 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2039 					     gfp);
2040 			if (!pte)
2041 				return -ENOMEM;
2042 			first_pte = pte;
2043 
2044 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2045 
2046 			/* It is large page*/
2047 			if (largepage_lvl > 1) {
2048 				unsigned long end_pfn;
2049 				unsigned long pages_to_remove;
2050 
2051 				pteval |= DMA_PTE_LARGE_PAGE;
2052 				pages_to_remove = min_t(unsigned long, nr_pages,
2053 							nr_pte_to_next_page(pte) * lvl_pages);
2054 				end_pfn = iov_pfn + pages_to_remove - 1;
2055 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2056 			} else {
2057 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2058 			}
2059 
2060 		}
2061 		/* We don't need lock here, nobody else
2062 		 * touches the iova range
2063 		 */
2064 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2065 		if (tmp) {
2066 			static int dumps = 5;
2067 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2068 				iov_pfn, tmp, (unsigned long long)pteval);
2069 			if (dumps) {
2070 				dumps--;
2071 				debug_dma_dump_mappings(NULL);
2072 			}
2073 			WARN_ON(1);
2074 		}
2075 
2076 		nr_pages -= lvl_pages;
2077 		iov_pfn += lvl_pages;
2078 		phys_pfn += lvl_pages;
2079 		pteval += lvl_pages * VTD_PAGE_SIZE;
2080 
2081 		/* If the next PTE would be the first in a new page, then we
2082 		 * need to flush the cache on the entries we've just written.
2083 		 * And then we'll need to recalculate 'pte', so clear it and
2084 		 * let it get set again in the if (!pte) block above.
2085 		 *
2086 		 * If we're done (!nr_pages) we need to flush the cache too.
2087 		 *
2088 		 * Also if we've been setting superpages, we may need to
2089 		 * recalculate 'pte' and switch back to smaller pages for the
2090 		 * end of the mapping, if the trailing size is not enough to
2091 		 * use another superpage (i.e. nr_pages < lvl_pages).
2092 		 */
2093 		pte++;
2094 		if (!nr_pages || first_pte_in_page(pte) ||
2095 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2096 			domain_flush_cache(domain, first_pte,
2097 					   (void *)pte - (void *)first_pte);
2098 			pte = NULL;
2099 		}
2100 	}
2101 
2102 	return 0;
2103 }
2104 
2105 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2106 {
2107 	struct intel_iommu *iommu = info->iommu;
2108 	struct context_entry *context;
2109 	u16 did_old;
2110 
2111 	if (!iommu)
2112 		return;
2113 
2114 	spin_lock(&iommu->lock);
2115 	context = iommu_context_addr(iommu, bus, devfn, 0);
2116 	if (!context) {
2117 		spin_unlock(&iommu->lock);
2118 		return;
2119 	}
2120 
2121 	if (sm_supported(iommu)) {
2122 		if (hw_pass_through && domain_type_is_si(info->domain))
2123 			did_old = FLPT_DEFAULT_DID;
2124 		else
2125 			did_old = domain_id_iommu(info->domain, iommu);
2126 	} else {
2127 		did_old = context_domain_id(context);
2128 	}
2129 
2130 	context_clear_entry(context);
2131 	__iommu_flush_cache(iommu, context, sizeof(*context));
2132 	spin_unlock(&iommu->lock);
2133 	iommu->flush.flush_context(iommu,
2134 				   did_old,
2135 				   (((u16)bus) << 8) | devfn,
2136 				   DMA_CCMD_MASK_NOBIT,
2137 				   DMA_CCMD_DEVICE_INVL);
2138 
2139 	if (sm_supported(iommu))
2140 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2141 
2142 	iommu->flush.flush_iotlb(iommu,
2143 				 did_old,
2144 				 0,
2145 				 0,
2146 				 DMA_TLB_DSI_FLUSH);
2147 
2148 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2149 }
2150 
2151 static int domain_setup_first_level(struct intel_iommu *iommu,
2152 				    struct dmar_domain *domain,
2153 				    struct device *dev,
2154 				    u32 pasid)
2155 {
2156 	struct dma_pte *pgd = domain->pgd;
2157 	int agaw, level;
2158 	int flags = 0;
2159 
2160 	/*
2161 	 * Skip top levels of page tables for iommu which has
2162 	 * less agaw than default. Unnecessary for PT mode.
2163 	 */
2164 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2165 		pgd = phys_to_virt(dma_pte_addr(pgd));
2166 		if (!dma_pte_present(pgd))
2167 			return -ENOMEM;
2168 	}
2169 
2170 	level = agaw_to_level(agaw);
2171 	if (level != 4 && level != 5)
2172 		return -EINVAL;
2173 
2174 	if (level == 5)
2175 		flags |= PASID_FLAG_FL5LP;
2176 
2177 	if (domain->force_snooping)
2178 		flags |= PASID_FLAG_PAGE_SNOOP;
2179 
2180 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2181 					     domain_id_iommu(domain, iommu),
2182 					     flags);
2183 }
2184 
2185 static bool dev_is_real_dma_subdevice(struct device *dev)
2186 {
2187 	return dev && dev_is_pci(dev) &&
2188 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2189 }
2190 
2191 static int iommu_domain_identity_map(struct dmar_domain *domain,
2192 				     unsigned long first_vpfn,
2193 				     unsigned long last_vpfn)
2194 {
2195 	/*
2196 	 * RMRR range might have overlap with physical memory range,
2197 	 * clear it first
2198 	 */
2199 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2200 
2201 	return __domain_mapping(domain, first_vpfn,
2202 				first_vpfn, last_vpfn - first_vpfn + 1,
2203 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2204 }
2205 
2206 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2207 
2208 static int __init si_domain_init(int hw)
2209 {
2210 	struct dmar_rmrr_unit *rmrr;
2211 	struct device *dev;
2212 	int i, nid, ret;
2213 
2214 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2215 	if (!si_domain)
2216 		return -EFAULT;
2217 
2218 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2219 		domain_exit(si_domain);
2220 		si_domain = NULL;
2221 		return -EFAULT;
2222 	}
2223 
2224 	if (hw)
2225 		return 0;
2226 
2227 	for_each_online_node(nid) {
2228 		unsigned long start_pfn, end_pfn;
2229 		int i;
2230 
2231 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2232 			ret = iommu_domain_identity_map(si_domain,
2233 					mm_to_dma_pfn_start(start_pfn),
2234 					mm_to_dma_pfn_end(end_pfn));
2235 			if (ret)
2236 				return ret;
2237 		}
2238 	}
2239 
2240 	/*
2241 	 * Identity map the RMRRs so that devices with RMRRs could also use
2242 	 * the si_domain.
2243 	 */
2244 	for_each_rmrr_units(rmrr) {
2245 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2246 					  i, dev) {
2247 			unsigned long long start = rmrr->base_address;
2248 			unsigned long long end = rmrr->end_address;
2249 
2250 			if (WARN_ON(end < start ||
2251 				    end >> agaw_to_width(si_domain->agaw)))
2252 				continue;
2253 
2254 			ret = iommu_domain_identity_map(si_domain,
2255 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2256 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2257 			if (ret)
2258 				return ret;
2259 		}
2260 	}
2261 
2262 	return 0;
2263 }
2264 
/*
 * Attach @dev to @domain.
 *
 * The domain is first attached to the device's IOMMU, then the device is
 * linked onto domain->devices under domain->lock. In scalable mode (and
 * unless @dev is a real-DMA sub-device) the IOMMU_NO_PASID entry is set up
 * as pass-through, first-level or second-level depending on the domain.
 * Finally the context mapping is installed and, where applicable, the PCI
 * capabilities are enabled via iommu_enable_pci_caps().
 *
 * Returns 0 on success. If the PASID or context setup fails,
 * device_block_translation() is called for @dev and the error is returned.
 */
static int dmar_domain_attach_device(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;
	int ret;

	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		return ret;
	info->domain = domain;
	/* domain->devices is protected by domain->lock. */
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&info->link, &domain->devices);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		/* Setup the PASID entry for requests without PASID: */
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu,
					dev, IOMMU_NO_PASID);
		else if (domain->use_first_level)
			ret = domain_setup_first_level(iommu, domain, dev,
					IOMMU_NO_PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, IOMMU_NO_PASID);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			device_block_translation(dev);
			return ret;
		}
	}

	ret = domain_context_mapping(domain, dev);
	if (ret) {
		dev_err(dev, "Domain context map failed\n");
		device_block_translation(dev);
		return ret;
	}

	/*
	 * PCI capabilities are enabled in scalable mode, and in legacy mode
	 * only when the domain is not the static identity domain.
	 */
	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
		iommu_enable_pci_caps(info);

	return 0;
}
2312 
2313 /**
2314  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2315  * is relaxable (ie. is allowed to be not enforced under some conditions)
2316  * @dev: device handle
2317  *
2318  * We assume that PCI USB devices with RMRRs have them largely
2319  * for historical reasons and that the RMRR space is not actively used post
2320  * boot.  This exclusion may change if vendors begin to abuse it.
2321  *
2322  * The same exception is made for graphics devices, with the requirement that
2323  * any use of the RMRR regions will be torn down before assigning the device
2324  * to a guest.
2325  *
2326  * Return: true if the RMRR is relaxable, false otherwise
2327  */
2328 static bool device_rmrr_is_relaxable(struct device *dev)
2329 {
2330 	struct pci_dev *pdev;
2331 
2332 	if (!dev_is_pci(dev))
2333 		return false;
2334 
2335 	pdev = to_pci_dev(dev);
2336 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2337 		return true;
2338 	else
2339 		return false;
2340 }
2341 
2342 /*
2343  * Return the required default domain type for a specific device.
2344  *
2345  * @dev: the device in query
2346  * @startup: true if this is during early boot
2347  *
2348  * Returns:
2349  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2350  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2351  *  - 0: both identity and dynamic domains work for this device
2352  */
2353 static int device_def_domain_type(struct device *dev)
2354 {
2355 	if (dev_is_pci(dev)) {
2356 		struct pci_dev *pdev = to_pci_dev(dev);
2357 
2358 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359 			return IOMMU_DOMAIN_IDENTITY;
2360 
2361 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362 			return IOMMU_DOMAIN_IDENTITY;
2363 	}
2364 
2365 	return 0;
2366 }
2367 
2368 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2369 {
2370 	/*
2371 	 * Start from the sane iommu hardware state.
2372 	 * If the queued invalidation is already initialized by us
2373 	 * (for example, while enabling interrupt-remapping) then
2374 	 * we got the things already rolling from a sane state.
2375 	 */
2376 	if (!iommu->qi) {
2377 		/*
2378 		 * Clear any previous faults.
2379 		 */
2380 		dmar_fault(-1, iommu);
2381 		/*
2382 		 * Disable queued invalidation if supported and already enabled
2383 		 * before OS handover.
2384 		 */
2385 		dmar_disable_qi(iommu);
2386 	}
2387 
2388 	if (dmar_enable_qi(iommu)) {
2389 		/*
2390 		 * Queued Invalidate not enabled, use Register Based Invalidate
2391 		 */
2392 		iommu->flush.flush_context = __iommu_flush_context;
2393 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2394 		pr_info("%s: Using Register based invalidation\n",
2395 			iommu->name);
2396 	} else {
2397 		iommu->flush.flush_context = qi_flush_context;
2398 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2399 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2400 	}
2401 }
2402 
/*
 * Copy the context table(s) that the old root entry @old_re points to for
 * @bus into freshly allocated pages, and record those pages in @tbl.
 *
 * @iommu:  IOMMU whose domain-ID bitmap and copied-context state are updated
 * @old_re: root entry for @bus (a local copy is taken before it is read)
 * @tbl:    array receiving the new context table pages; slot @bus is used
 *          when @ext is false, slots starting at @bus * 2 when it is true
 * @bus:    bus number being copied
 * @ext:    true when each context entry takes two slots and the root entry
 *          carries a lower and an upper context table pointer
 *
 * For each present entry the domain ID is marked as in use in
 * iommu->domain_ids (if within cap_ndoms()) and the entry is flagged via
 * set_context_copied().
 *
 * Returns 0 on success, including the case where the root entry has no
 * context table, or -ENOMEM if mapping the old table or allocating the new
 * one fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx wraps to 0 at devfn 0 and, when ext is set, again at
		 * devfn 0x80: that is where a new old/new table pair starts.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/* The loop increment makes this devfn 0x80. */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the domain ID used by the old kernel. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/*
	 * Store the last table that was filled in.
	 * NOTE(review): pos is only 1 if a lower table was saved above; when
	 * LCTP is absent but UCTP is present the upper table appears to be
	 * stored at tbl[tbl_idx] rather than tbl[tbl_idx + 1] - confirm this
	 * is intended.
	 */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2486 
/*
 * Copy the root/context tables that are currently programmed into @iommu
 * (as read from DMAR_RTADDR_REG) into this kernel's root_entry table.
 *
 * The context tables of all 256 buses are copied with copy_context_table();
 * a failure for a single bus is logged and that bus is skipped. The copied
 * tables are then hooked into iommu->root_entry under iommu->lock and the
 * root table is flushed.
 *
 * Returns 0 on success, -EINVAL if the table format in use differs from the
 * one this kernel would use or if no root table address is programmed, and
 * -ENOMEM on allocation or mapping failure.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	/* ext: format currently programmed; new_ext: format we would use. */
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	/* One bit per (bus, devfn) pair: 2^16 bits. */
	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			/* Not fatal: log it and carry on with the next bus. */
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			/* Bit 0 is set together with the table address. */
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	/* Per-bus copy failures above do not fail the whole operation. */
	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2571 
/*
 * Boot-time initialisation of all DMAR units.
 *
 * For every non-ignored IOMMU: set up invalidation, the domain-ID space and
 * a root entry table, and - if translation was left enabled - try to copy
 * the previous kernel's translation tables. Afterwards root entries are
 * programmed, the static identity domain is created and, per IOMMU, the page
 * request queue (when CONFIG_INTEL_IOMMU_SVM is set and supported) and the
 * fault interrupt are set up.
 *
 * Returns 0 on success. On any failure all active IOMMUs are disabled and
 * freed, si_domain is destroyed and the error code is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/*
		 * Translation left enabled is only kept when running as a
		 * kdump kernel; otherwise switch it off and start clean.
		 */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Pass-through is only usable if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	/* Graphics devices get identity mapped when gfx mapping is off. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Unwind: tear down every active IOMMU and the identity domain. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2728 
/*
 * Decide which DMAR units can be ignored.
 *
 * A unit that is not include_all and has no active device in its scope is
 * marked ignored. A non-include_all unit whose active devices are all PCI
 * graphics devices is marked gfx_dedicated and, when dmar_map_gfx is clear,
 * ignored as well.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/*
			 * Stop at the first active device; if there is none,
			 * i ends up equal to devices_cnt.
			 */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first active device that is not PCI graphics. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* Loop ended early: a non-graphics device was found. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Mark it gfx_dedicated
		   and bypass it if graphics mapping is disabled */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx)
			drhd->ignored = 1;
	}
}
2764 
2765 #ifdef CONFIG_SUSPEND
2766 static int init_iommu_hw(void)
2767 {
2768 	struct dmar_drhd_unit *drhd;
2769 	struct intel_iommu *iommu = NULL;
2770 	int ret;
2771 
2772 	for_each_active_iommu(iommu, drhd) {
2773 		if (iommu->qi) {
2774 			ret = dmar_reenable_qi(iommu);
2775 			if (ret)
2776 				return ret;
2777 		}
2778 	}
2779 
2780 	for_each_iommu(iommu, drhd) {
2781 		if (drhd->ignored) {
2782 			/*
2783 			 * we always have to disable PMRs or DMA may fail on
2784 			 * this device
2785 			 */
2786 			if (force_on)
2787 				iommu_disable_protect_mem_regions(iommu);
2788 			continue;
2789 		}
2790 
2791 		iommu_flush_write_buffer(iommu);
2792 		iommu_set_root_entry(iommu);
2793 		iommu_enable_translation(iommu);
2794 		iommu_disable_protect_mem_regions(iommu);
2795 	}
2796 
2797 	return 0;
2798 }
2799 
2800 static void iommu_flush_all(void)
2801 {
2802 	struct dmar_drhd_unit *drhd;
2803 	struct intel_iommu *iommu;
2804 
2805 	for_each_active_iommu(iommu, drhd) {
2806 		iommu->flush.flush_context(iommu, 0, 0, 0,
2807 					   DMA_CCMD_GLOBAL_INVL);
2808 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2809 					 DMA_TLB_GLOBAL_FLUSH);
2810 	}
2811 }
2812 
/*
 * syscore suspend callback: flush all IOMMU caches, then for each active
 * IOMMU disable translation and save the four fault-event registers
 * (FECTL, FEDATA, FEADDR, FEUADDR) into iommu->iommu_state so that
 * iommu_resume() can write them back.
 *
 * Always returns 0.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		/* Register reads are done under register_lock. */
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;
}
2839 
2840 static void iommu_resume(void)
2841 {
2842 	struct dmar_drhd_unit *drhd;
2843 	struct intel_iommu *iommu = NULL;
2844 	unsigned long flag;
2845 
2846 	if (init_iommu_hw()) {
2847 		if (force_on)
2848 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2849 		else
2850 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2851 		return;
2852 	}
2853 
2854 	for_each_active_iommu(iommu, drhd) {
2855 
2856 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2857 
2858 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2859 			iommu->reg + DMAR_FECTL_REG);
2860 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2861 			iommu->reg + DMAR_FEDATA_REG);
2862 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2863 			iommu->reg + DMAR_FEADDR_REG);
2864 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2865 			iommu->reg + DMAR_FEUADDR_REG);
2866 
2867 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2868 	}
2869 }
2870 
2871 static struct syscore_ops iommu_syscore_ops = {
2872 	.resume		= iommu_resume,
2873 	.suspend	= iommu_suspend,
2874 };
2875 
/* Register the IOMMU suspend/resume callbacks as syscore operations. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
/* Stub for builds without CONFIG_SUSPEND: there is nothing to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2884 
2885 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2886 {
2887 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2888 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2889 	    rmrr->end_address <= rmrr->base_address ||
2890 	    arch_rmrr_sanity_check(rmrr))
2891 		return -EINVAL;
2892 
2893 	return 0;
2894 }
2895 
2896 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2897 {
2898 	struct acpi_dmar_reserved_memory *rmrr;
2899 	struct dmar_rmrr_unit *rmrru;
2900 
2901 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2902 	if (rmrr_sanity_check(rmrr)) {
2903 		pr_warn(FW_BUG
2904 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2905 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2906 			   rmrr->base_address, rmrr->end_address,
2907 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2908 			   dmi_get_system_info(DMI_BIOS_VERSION),
2909 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2910 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2911 	}
2912 
2913 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2914 	if (!rmrru)
2915 		goto out;
2916 
2917 	rmrru->hdr = header;
2918 
2919 	rmrru->base_address = rmrr->base_address;
2920 	rmrru->end_address = rmrr->end_address;
2921 
2922 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2923 				((void *)rmrr) + rmrr->header.length,
2924 				&rmrru->devices_cnt);
2925 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2926 		goto free_rmrru;
2927 
2928 	list_add(&rmrru->list, &dmar_rmrr_units);
2929 
2930 	return 0;
2931 free_rmrru:
2932 	kfree(rmrru);
2933 out:
2934 	return -ENOMEM;
2935 }
2936 
2937 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2938 {
2939 	struct dmar_atsr_unit *atsru;
2940 	struct acpi_dmar_atsr *tmp;
2941 
2942 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2943 				dmar_rcu_check()) {
2944 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2945 		if (atsr->segment != tmp->segment)
2946 			continue;
2947 		if (atsr->header.length != tmp->header.length)
2948 			continue;
2949 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2950 			return atsru;
2951 	}
2952 
2953 	return NULL;
2954 }
2955 
2956 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2957 {
2958 	struct acpi_dmar_atsr *atsr;
2959 	struct dmar_atsr_unit *atsru;
2960 
2961 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2962 		return 0;
2963 
2964 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2965 	atsru = dmar_find_atsr(atsr);
2966 	if (atsru)
2967 		return 0;
2968 
2969 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2970 	if (!atsru)
2971 		return -ENOMEM;
2972 
2973 	/*
2974 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2975 	 * copy the memory content because the memory buffer will be freed
2976 	 * on return.
2977 	 */
2978 	atsru->hdr = (void *)(atsru + 1);
2979 	memcpy(atsru->hdr, hdr, hdr->length);
2980 	atsru->include_all = atsr->flags & 0x1;
2981 	if (!atsru->include_all) {
2982 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2983 				(void *)atsr + atsr->header.length,
2984 				&atsru->devices_cnt);
2985 		if (atsru->devices_cnt && atsru->devices == NULL) {
2986 			kfree(atsru);
2987 			return -ENOMEM;
2988 		}
2989 	}
2990 
2991 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2992 
2993 	return 0;
2994 }
2995 
/* Release the device scope of @atsru and then free the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
3001 
3002 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3003 {
3004 	struct acpi_dmar_atsr *atsr;
3005 	struct dmar_atsr_unit *atsru;
3006 
3007 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3008 	atsru = dmar_find_atsr(atsr);
3009 	if (atsru) {
3010 		list_del_rcu(&atsru->list);
3011 		synchronize_rcu();
3012 		intel_iommu_free_atsr(atsru);
3013 	}
3014 
3015 	return 0;
3016 }
3017 
/*
 * Check whether the ATSR unit matching @hdr is still in use.
 *
 * Returns -EBUSY if the unit is not include_all and at least one device in
 * its scope is active; 0 otherwise, including when no matching unit is
 * registered.
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		/* The loop body runs only for active devices: one is enough. */
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
3038 
3039 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3040 {
3041 	struct dmar_satc_unit *satcu;
3042 	struct acpi_dmar_satc *tmp;
3043 
3044 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3045 				dmar_rcu_check()) {
3046 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3047 		if (satc->segment != tmp->segment)
3048 			continue;
3049 		if (satc->header.length != tmp->header.length)
3050 			continue;
3051 		if (memcmp(satc, tmp, satc->header.length) == 0)
3052 			return satcu;
3053 	}
3054 
3055 	return NULL;
3056 }
3057 
3058 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3059 {
3060 	struct acpi_dmar_satc *satc;
3061 	struct dmar_satc_unit *satcu;
3062 
3063 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3064 		return 0;
3065 
3066 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3067 	satcu = dmar_find_satc(satc);
3068 	if (satcu)
3069 		return 0;
3070 
3071 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3072 	if (!satcu)
3073 		return -ENOMEM;
3074 
3075 	satcu->hdr = (void *)(satcu + 1);
3076 	memcpy(satcu->hdr, hdr, hdr->length);
3077 	satcu->atc_required = satc->flags & 0x1;
3078 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3079 					      (void *)satc + satc->header.length,
3080 					      &satcu->devices_cnt);
3081 	if (satcu->devices_cnt && !satcu->devices) {
3082 		kfree(satcu);
3083 		return -ENOMEM;
3084 	}
3085 	list_add_rcu(&satcu->list, &dmar_satc_units);
3086 
3087 	return 0;
3088 }
3089 
/*
 * Bring a hot-added DMAR unit into service.
 *
 * Audits the unit's capabilities, checks that it supports pass-through and
 * the superpage size already in use, sets up its domain-ID space and root
 * entry, and - unless the unit is ignored - initialises invalidation, the
 * page request queue (when CONFIG_INTEL_IOMMU_SVM is set and supported)
 * and the fault interrupt before enabling translation.
 *
 * Returns 0 on success, -ENXIO if a required capability is missing, or the
 * error of the failing setup step.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/*
	 * NOTE(review): the two -ENXIO returns below do not go through the
	 * 'out' label and so skip free_dmar_iommu(), unlike the other error
	 * paths - confirm this is intentional.
	 */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
3161 
3162 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3163 {
3164 	int ret = 0;
3165 	struct intel_iommu *iommu = dmaru->iommu;
3166 
3167 	if (!intel_iommu_enabled)
3168 		return 0;
3169 	if (iommu == NULL)
3170 		return -EINVAL;
3171 
3172 	if (insert) {
3173 		ret = intel_iommu_add(dmaru);
3174 	} else {
3175 		disable_dmar_iommu(iommu);
3176 		free_dmar_iommu(iommu);
3177 	}
3178 
3179 	return ret;
3180 }
3181 
3182 static void intel_iommu_free_dmars(void)
3183 {
3184 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3185 	struct dmar_atsr_unit *atsru, *atsr_n;
3186 	struct dmar_satc_unit *satcu, *satc_n;
3187 
3188 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3189 		list_del(&rmrru->list);
3190 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3191 		kfree(rmrru);
3192 	}
3193 
3194 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3195 		list_del(&atsru->list);
3196 		intel_iommu_free_atsr(atsru);
3197 	}
3198 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3199 		list_del(&satcu->list);
3200 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3201 		kfree(satcu);
3202 	}
3203 }
3204 
3205 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3206 {
3207 	struct dmar_satc_unit *satcu;
3208 	struct acpi_dmar_satc *satc;
3209 	struct device *tmp;
3210 	int i;
3211 
3212 	dev = pci_physfn(dev);
3213 	rcu_read_lock();
3214 
3215 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3216 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3217 		if (satc->segment != pci_domain_nr(dev->bus))
3218 			continue;
3219 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3220 			if (to_pci_dev(tmp) == dev)
3221 				goto out;
3222 	}
3223 	satcu = NULL;
3224 out:
3225 	rcu_read_unlock();
3226 	return satcu;
3227 }
3228 
3229 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3230 {
3231 	int i, ret = 1;
3232 	struct pci_bus *bus;
3233 	struct pci_dev *bridge = NULL;
3234 	struct device *tmp;
3235 	struct acpi_dmar_atsr *atsr;
3236 	struct dmar_atsr_unit *atsru;
3237 	struct dmar_satc_unit *satcu;
3238 
3239 	dev = pci_physfn(dev);
3240 	satcu = dmar_find_matched_satc_unit(dev);
3241 	if (satcu)
3242 		/*
3243 		 * This device supports ATS as it is in SATC table.
3244 		 * When IOMMU is in legacy mode, enabling ATS is done
3245 		 * automatically by HW for the device that requires
3246 		 * ATS, hence OS should not enable this device ATS
3247 		 * to avoid duplicated TLB invalidation.
3248 		 */
3249 		return !(satcu->atc_required && !sm_supported(iommu));
3250 
3251 	for (bus = dev->bus; bus; bus = bus->parent) {
3252 		bridge = bus->self;
3253 		/* If it's an integrated device, allow ATS */
3254 		if (!bridge)
3255 			return 1;
3256 		/* Connected via non-PCIe: no ATS */
3257 		if (!pci_is_pcie(bridge) ||
3258 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3259 			return 0;
3260 		/* If we found the root port, look it up in the ATSR */
3261 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3262 			break;
3263 	}
3264 
3265 	rcu_read_lock();
3266 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3267 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3268 		if (atsr->segment != pci_domain_nr(dev->bus))
3269 			continue;
3270 
3271 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3272 			if (tmp == &bridge->dev)
3273 				goto out;
3274 
3275 		if (atsru->include_all)
3276 			goto out;
3277 	}
3278 	ret = 0;
3279 out:
3280 	rcu_read_unlock();
3281 
3282 	return ret;
3283 }
3284 
3285 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3286 {
3287 	int ret;
3288 	struct dmar_rmrr_unit *rmrru;
3289 	struct dmar_atsr_unit *atsru;
3290 	struct dmar_satc_unit *satcu;
3291 	struct acpi_dmar_atsr *atsr;
3292 	struct acpi_dmar_reserved_memory *rmrr;
3293 	struct acpi_dmar_satc *satc;
3294 
3295 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3296 		return 0;
3297 
3298 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3299 		rmrr = container_of(rmrru->hdr,
3300 				    struct acpi_dmar_reserved_memory, header);
3301 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3302 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3303 				((void *)rmrr) + rmrr->header.length,
3304 				rmrr->segment, rmrru->devices,
3305 				rmrru->devices_cnt);
3306 			if (ret < 0)
3307 				return ret;
3308 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3309 			dmar_remove_dev_scope(info, rmrr->segment,
3310 				rmrru->devices, rmrru->devices_cnt);
3311 		}
3312 	}
3313 
3314 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3315 		if (atsru->include_all)
3316 			continue;
3317 
3318 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3319 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3320 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3321 					(void *)atsr + atsr->header.length,
3322 					atsr->segment, atsru->devices,
3323 					atsru->devices_cnt);
3324 			if (ret > 0)
3325 				break;
3326 			else if (ret < 0)
3327 				return ret;
3328 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3329 			if (dmar_remove_dev_scope(info, atsr->segment,
3330 					atsru->devices, atsru->devices_cnt))
3331 				break;
3332 		}
3333 	}
3334 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3335 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3336 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3337 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3338 					(void *)satc + satc->header.length,
3339 					satc->segment, satcu->devices,
3340 					satcu->devices_cnt);
3341 			if (ret > 0)
3342 				break;
3343 			else if (ret < 0)
3344 				return ret;
3345 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3346 			if (dmar_remove_dev_scope(info, satc->segment,
3347 					satcu->devices, satcu->devices_cnt))
3348 				break;
3349 		}
3350 	}
3351 
3352 	return 0;
3353 }
3354 
3355 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3356 				       unsigned long val, void *v)
3357 {
3358 	struct memory_notify *mhp = v;
3359 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3360 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3361 			mhp->nr_pages - 1);
3362 
3363 	switch (val) {
3364 	case MEM_GOING_ONLINE:
3365 		if (iommu_domain_identity_map(si_domain,
3366 					      start_vpfn, last_vpfn)) {
3367 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3368 				start_vpfn, last_vpfn);
3369 			return NOTIFY_BAD;
3370 		}
3371 		break;
3372 
3373 	case MEM_OFFLINE:
3374 	case MEM_CANCEL_ONLINE:
3375 		{
3376 			struct dmar_drhd_unit *drhd;
3377 			struct intel_iommu *iommu;
3378 			LIST_HEAD(freelist);
3379 
3380 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3381 
3382 			rcu_read_lock();
3383 			for_each_active_iommu(iommu, drhd)
3384 				iommu_flush_iotlb_psi(iommu, si_domain,
3385 					start_vpfn, mhp->nr_pages,
3386 					list_empty(&freelist), 0);
3387 			rcu_read_unlock();
3388 			put_pages_list(&freelist);
3389 		}
3390 		break;
3391 	}
3392 
3393 	return NOTIFY_OK;
3394 }
3395 
3396 static struct notifier_block intel_iommu_memory_nb = {
3397 	.notifier_call = intel_iommu_memory_notifier,
3398 	.priority = 0
3399 };
3400 
3401 static void intel_disable_iommus(void)
3402 {
3403 	struct intel_iommu *iommu = NULL;
3404 	struct dmar_drhd_unit *drhd;
3405 
3406 	for_each_iommu(iommu, drhd)
3407 		iommu_disable_translation(iommu);
3408 }
3409 
3410 void intel_iommu_shutdown(void)
3411 {
3412 	struct dmar_drhd_unit *drhd;
3413 	struct intel_iommu *iommu = NULL;
3414 
3415 	if (no_iommu || dmar_disabled)
3416 		return;
3417 
3418 	down_write(&dmar_global_lock);
3419 
3420 	/* Disable PMRs explicitly here. */
3421 	for_each_iommu(iommu, drhd)
3422 		iommu_disable_protect_mem_regions(iommu);
3423 
3424 	/* Make sure the IOMMUs are switched off */
3425 	intel_disable_iommus();
3426 
3427 	up_write(&dmar_global_lock);
3428 }
3429 
3430 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3431 {
3432 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3433 
3434 	return container_of(iommu_dev, struct intel_iommu, iommu);
3435 }
3436 
3437 static ssize_t version_show(struct device *dev,
3438 			    struct device_attribute *attr, char *buf)
3439 {
3440 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3441 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3442 	return sysfs_emit(buf, "%d:%d\n",
3443 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3444 }
3445 static DEVICE_ATTR_RO(version);
3446 
3447 static ssize_t address_show(struct device *dev,
3448 			    struct device_attribute *attr, char *buf)
3449 {
3450 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3451 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3452 }
3453 static DEVICE_ATTR_RO(address);
3454 
3455 static ssize_t cap_show(struct device *dev,
3456 			struct device_attribute *attr, char *buf)
3457 {
3458 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3459 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3460 }
3461 static DEVICE_ATTR_RO(cap);
3462 
3463 static ssize_t ecap_show(struct device *dev,
3464 			 struct device_attribute *attr, char *buf)
3465 {
3466 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3467 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3468 }
3469 static DEVICE_ATTR_RO(ecap);
3470 
3471 static ssize_t domains_supported_show(struct device *dev,
3472 				      struct device_attribute *attr, char *buf)
3473 {
3474 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3475 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3476 }
3477 static DEVICE_ATTR_RO(domains_supported);
3478 
3479 static ssize_t domains_used_show(struct device *dev,
3480 				 struct device_attribute *attr, char *buf)
3481 {
3482 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3483 	return sysfs_emit(buf, "%d\n",
3484 			  bitmap_weight(iommu->domain_ids,
3485 					cap_ndoms(iommu->cap)));
3486 }
3487 static DEVICE_ATTR_RO(domains_used);
3488 
3489 static struct attribute *intel_iommu_attrs[] = {
3490 	&dev_attr_version.attr,
3491 	&dev_attr_address.attr,
3492 	&dev_attr_cap.attr,
3493 	&dev_attr_ecap.attr,
3494 	&dev_attr_domains_supported.attr,
3495 	&dev_attr_domains_used.attr,
3496 	NULL,
3497 };
3498 
3499 static struct attribute_group intel_iommu_group = {
3500 	.name = "intel-iommu",
3501 	.attrs = intel_iommu_attrs,
3502 };
3503 
3504 const struct attribute_group *intel_iommu_groups[] = {
3505 	&intel_iommu_group,
3506 	NULL,
3507 };
3508 
3509 static bool has_external_pci(void)
3510 {
3511 	struct pci_dev *pdev = NULL;
3512 
3513 	for_each_pci_dev(pdev)
3514 		if (pdev->external_facing) {
3515 			pci_dev_put(pdev);
3516 			return true;
3517 		}
3518 
3519 	return false;
3520 }
3521 
3522 static int __init platform_optin_force_iommu(void)
3523 {
3524 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3525 		return 0;
3526 
3527 	if (no_iommu || dmar_disabled)
3528 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3529 
3530 	/*
3531 	 * If Intel-IOMMU is disabled by default, we will apply identity
3532 	 * map for all devices except those marked as being untrusted.
3533 	 */
3534 	if (dmar_disabled)
3535 		iommu_set_default_passthrough(false);
3536 
3537 	dmar_disabled = 0;
3538 	no_iommu = 0;
3539 
3540 	return 1;
3541 }
3542 
3543 static int __init probe_acpi_namespace_devices(void)
3544 {
3545 	struct dmar_drhd_unit *drhd;
3546 	/* To avoid a -Wunused-but-set-variable warning. */
3547 	struct intel_iommu *iommu __maybe_unused;
3548 	struct device *dev;
3549 	int i, ret = 0;
3550 
3551 	for_each_active_iommu(iommu, drhd) {
3552 		for_each_active_dev_scope(drhd->devices,
3553 					  drhd->devices_cnt, i, dev) {
3554 			struct acpi_device_physical_node *pn;
3555 			struct acpi_device *adev;
3556 
3557 			if (dev->bus != &acpi_bus_type)
3558 				continue;
3559 
3560 			adev = to_acpi_device(dev);
3561 			mutex_lock(&adev->physical_node_lock);
3562 			list_for_each_entry(pn,
3563 					    &adev->physical_node_list, node) {
3564 				ret = iommu_probe_device(pn->dev);
3565 				if (ret)
3566 					break;
3567 			}
3568 			mutex_unlock(&adev->physical_node_lock);
3569 
3570 			if (ret)
3571 				return ret;
3572 		}
3573 	}
3574 
3575 	return 0;
3576 }
3577 
3578 static __init int tboot_force_iommu(void)
3579 {
3580 	if (!tboot_enabled())
3581 		return 0;
3582 
3583 	if (no_iommu || dmar_disabled)
3584 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3585 
3586 	dmar_disabled = 0;
3587 	no_iommu = 0;
3588 
3589 	return 1;
3590 }
3591 
3592 int __init intel_iommu_init(void)
3593 {
3594 	int ret = -ENODEV;
3595 	struct dmar_drhd_unit *drhd;
3596 	struct intel_iommu *iommu;
3597 
3598 	/*
3599 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3600 	 * opt in, so enforce that.
3601 	 */
3602 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3603 		    platform_optin_force_iommu();
3604 
3605 	down_write(&dmar_global_lock);
3606 	if (dmar_table_init()) {
3607 		if (force_on)
3608 			panic("tboot: Failed to initialize DMAR table\n");
3609 		goto out_free_dmar;
3610 	}
3611 
3612 	if (dmar_dev_scope_init() < 0) {
3613 		if (force_on)
3614 			panic("tboot: Failed to initialize DMAR device scope\n");
3615 		goto out_free_dmar;
3616 	}
3617 
3618 	up_write(&dmar_global_lock);
3619 
3620 	/*
3621 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3622 	 * complain later when we register it under the lock.
3623 	 */
3624 	dmar_register_bus_notifier();
3625 
3626 	down_write(&dmar_global_lock);
3627 
3628 	if (!no_iommu)
3629 		intel_iommu_debugfs_init();
3630 
3631 	if (no_iommu || dmar_disabled) {
3632 		/*
3633 		 * We exit the function here to ensure IOMMU's remapping and
3634 		 * mempool aren't setup, which means that the IOMMU's PMRs
3635 		 * won't be disabled via the call to init_dmars(). So disable
3636 		 * it explicitly here. The PMRs were setup by tboot prior to
3637 		 * calling SENTER, but the kernel is expected to reset/tear
3638 		 * down the PMRs.
3639 		 */
3640 		if (intel_iommu_tboot_noforce) {
3641 			for_each_iommu(iommu, drhd)
3642 				iommu_disable_protect_mem_regions(iommu);
3643 		}
3644 
3645 		/*
3646 		 * Make sure the IOMMUs are switched off, even when we
3647 		 * boot into a kexec kernel and the previous kernel left
3648 		 * them enabled
3649 		 */
3650 		intel_disable_iommus();
3651 		goto out_free_dmar;
3652 	}
3653 
3654 	if (list_empty(&dmar_rmrr_units))
3655 		pr_info("No RMRR found\n");
3656 
3657 	if (list_empty(&dmar_atsr_units))
3658 		pr_info("No ATSR found\n");
3659 
3660 	if (list_empty(&dmar_satc_units))
3661 		pr_info("No SATC found\n");
3662 
3663 	init_no_remapping_devices();
3664 
3665 	ret = init_dmars();
3666 	if (ret) {
3667 		if (force_on)
3668 			panic("tboot: Failed to initialize DMARs\n");
3669 		pr_err("Initialization failed\n");
3670 		goto out_free_dmar;
3671 	}
3672 	up_write(&dmar_global_lock);
3673 
3674 	init_iommu_pm_ops();
3675 
3676 	down_read(&dmar_global_lock);
3677 	for_each_active_iommu(iommu, drhd) {
3678 		/*
3679 		 * The flush queue implementation does not perform
3680 		 * page-selective invalidations that are required for efficient
3681 		 * TLB flushes in virtual environments.  The benefit of batching
3682 		 * is likely to be much lower than the overhead of synchronizing
3683 		 * the virtual and physical IOMMU page-tables.
3684 		 */
3685 		if (cap_caching_mode(iommu->cap) &&
3686 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3687 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3688 			iommu_set_dma_strict();
3689 		}
3690 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3691 				       intel_iommu_groups,
3692 				       "%s", iommu->name);
3693 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3694 
3695 		iommu_pmu_register(iommu);
3696 	}
3697 	up_read(&dmar_global_lock);
3698 
3699 	if (si_domain && !hw_pass_through)
3700 		register_memory_notifier(&intel_iommu_memory_nb);
3701 
3702 	down_read(&dmar_global_lock);
3703 	if (probe_acpi_namespace_devices())
3704 		pr_warn("ACPI name space devices didn't probe correctly\n");
3705 
3706 	/* Finally, we enable the DMA remapping hardware. */
3707 	for_each_iommu(iommu, drhd) {
3708 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3709 			iommu_enable_translation(iommu);
3710 
3711 		iommu_disable_protect_mem_regions(iommu);
3712 	}
3713 	up_read(&dmar_global_lock);
3714 
3715 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3716 
3717 	intel_iommu_enabled = 1;
3718 
3719 	return 0;
3720 
3721 out_free_dmar:
3722 	intel_iommu_free_dmars();
3723 	up_write(&dmar_global_lock);
3724 	return ret;
3725 }
3726 
3727 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3728 {
3729 	struct device_domain_info *info = opaque;
3730 
3731 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3732 	return 0;
3733 }
3734 
3735 /*
3736  * NB - intel-iommu lacks any sort of reference counting for the users of
3737  * dependent devices.  If multiple endpoints have intersecting dependent
3738  * devices, unbinding the driver from any one of them will possibly leave
3739  * the others unable to operate.
3740  */
3741 static void domain_context_clear(struct device_domain_info *info)
3742 {
3743 	if (!dev_is_pci(info->dev))
3744 		domain_context_clear_one(info, info->bus, info->devfn);
3745 
3746 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3747 			       &domain_context_clear_one_cb, info);
3748 }
3749 
3750 static void dmar_remove_one_dev_info(struct device *dev)
3751 {
3752 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3753 	struct dmar_domain *domain = info->domain;
3754 	struct intel_iommu *iommu = info->iommu;
3755 	unsigned long flags;
3756 
3757 	if (!dev_is_real_dma_subdevice(info->dev)) {
3758 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3759 			intel_pasid_tear_down_entry(iommu, info->dev,
3760 					IOMMU_NO_PASID, false);
3761 
3762 		iommu_disable_pci_caps(info);
3763 		domain_context_clear(info);
3764 	}
3765 
3766 	spin_lock_irqsave(&domain->lock, flags);
3767 	list_del(&info->link);
3768 	spin_unlock_irqrestore(&domain->lock, flags);
3769 
3770 	domain_detach_iommu(domain, iommu);
3771 	info->domain = NULL;
3772 }
3773 
3774 /*
3775  * Clear the page table pointer in context or pasid table entries so that
3776  * all DMA requests without PASID from the device are blocked. If the page
3777  * table has been set, clean up the data structures.
3778  */
3779 void device_block_translation(struct device *dev)
3780 {
3781 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3782 	struct intel_iommu *iommu = info->iommu;
3783 	unsigned long flags;
3784 
3785 	iommu_disable_pci_caps(info);
3786 	if (!dev_is_real_dma_subdevice(dev)) {
3787 		if (sm_supported(iommu))
3788 			intel_pasid_tear_down_entry(iommu, dev,
3789 						    IOMMU_NO_PASID, false);
3790 		else
3791 			domain_context_clear(info);
3792 	}
3793 
3794 	if (!info->domain)
3795 		return;
3796 
3797 	spin_lock_irqsave(&info->domain->lock, flags);
3798 	list_del(&info->link);
3799 	spin_unlock_irqrestore(&info->domain->lock, flags);
3800 
3801 	domain_detach_iommu(info->domain, iommu);
3802 	info->domain = NULL;
3803 }
3804 
3805 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3806 {
3807 	int adjust_width;
3808 
3809 	/* calculate AGAW */
3810 	domain->gaw = guest_width;
3811 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3812 	domain->agaw = width_to_agaw(adjust_width);
3813 
3814 	domain->iommu_coherency = false;
3815 	domain->iommu_superpage = 0;
3816 	domain->max_addr = 0;
3817 
3818 	/* always allocate the top pgd */
3819 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3820 	if (!domain->pgd)
3821 		return -ENOMEM;
3822 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3823 	return 0;
3824 }
3825 
/*
 * attach_dev op of the blocking domain: block translation for @dev.
 * Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3832 
3833 static struct iommu_domain blocking_domain = {
3834 	.type = IOMMU_DOMAIN_BLOCKED,
3835 	.ops = &(const struct iommu_domain_ops) {
3836 		.attach_dev	= blocking_domain_attach_dev,
3837 	}
3838 };
3839 
3840 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3841 {
3842 	struct dmar_domain *dmar_domain;
3843 	struct iommu_domain *domain;
3844 
3845 	switch (type) {
3846 	case IOMMU_DOMAIN_DMA:
3847 	case IOMMU_DOMAIN_UNMANAGED:
3848 		dmar_domain = alloc_domain(type);
3849 		if (!dmar_domain) {
3850 			pr_err("Can't allocate dmar_domain\n");
3851 			return NULL;
3852 		}
3853 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3854 			pr_err("Domain initialization failed\n");
3855 			domain_exit(dmar_domain);
3856 			return NULL;
3857 		}
3858 
3859 		domain = &dmar_domain->domain;
3860 		domain->geometry.aperture_start = 0;
3861 		domain->geometry.aperture_end   =
3862 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3863 		domain->geometry.force_aperture = true;
3864 
3865 		return domain;
3866 	case IOMMU_DOMAIN_IDENTITY:
3867 		return &si_domain->domain;
3868 	case IOMMU_DOMAIN_SVA:
3869 		return intel_svm_domain_alloc();
3870 	default:
3871 		return NULL;
3872 	}
3873 
3874 	return NULL;
3875 }
3876 
3877 static struct iommu_domain *
3878 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3879 			      struct iommu_domain *parent,
3880 			      const struct iommu_user_data *user_data)
3881 {
3882 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3883 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3884 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3885 	struct intel_iommu *iommu = info->iommu;
3886 	struct iommu_domain *domain;
3887 
3888 	/* Must be NESTING domain */
3889 	if (parent) {
3890 		if (!nested_supported(iommu) || flags)
3891 			return ERR_PTR(-EOPNOTSUPP);
3892 		return intel_nested_domain_alloc(parent, user_data);
3893 	}
3894 
3895 	if (flags &
3896 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3897 		return ERR_PTR(-EOPNOTSUPP);
3898 	if (nested_parent && !nested_supported(iommu))
3899 		return ERR_PTR(-EOPNOTSUPP);
3900 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3901 		return ERR_PTR(-EOPNOTSUPP);
3902 
3903 	/*
3904 	 * domain_alloc_user op needs to fully initialize a domain before
3905 	 * return, so uses iommu_domain_alloc() here for simple.
3906 	 */
3907 	domain = iommu_domain_alloc(dev->bus);
3908 	if (!domain)
3909 		return ERR_PTR(-ENOMEM);
3910 
3911 	if (nested_parent)
3912 		to_dmar_domain(domain)->nested_parent = true;
3913 
3914 	if (dirty_tracking) {
3915 		if (to_dmar_domain(domain)->use_first_level) {
3916 			iommu_domain_free(domain);
3917 			return ERR_PTR(-EOPNOTSUPP);
3918 		}
3919 		domain->dirty_ops = &intel_dirty_ops;
3920 	}
3921 
3922 	return domain;
3923 }
3924 
3925 static void intel_iommu_domain_free(struct iommu_domain *domain)
3926 {
3927 	if (domain != &si_domain->domain)
3928 		domain_exit(to_dmar_domain(domain));
3929 }
3930 
3931 int prepare_domain_attach_device(struct iommu_domain *domain,
3932 				 struct device *dev)
3933 {
3934 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3935 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3936 	struct intel_iommu *iommu = info->iommu;
3937 	int addr_width;
3938 
3939 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3940 		return -EINVAL;
3941 
3942 	if (domain->dirty_ops && !ssads_supported(iommu))
3943 		return -EINVAL;
3944 
3945 	/* check if this iommu agaw is sufficient for max mapped address */
3946 	addr_width = agaw_to_width(iommu->agaw);
3947 	if (addr_width > cap_mgaw(iommu->cap))
3948 		addr_width = cap_mgaw(iommu->cap);
3949 
3950 	if (dmar_domain->max_addr > (1LL << addr_width))
3951 		return -EINVAL;
3952 	dmar_domain->gaw = addr_width;
3953 
3954 	/*
3955 	 * Knock out extra levels of page tables if necessary
3956 	 */
3957 	while (iommu->agaw < dmar_domain->agaw) {
3958 		struct dma_pte *pte;
3959 
3960 		pte = dmar_domain->pgd;
3961 		if (dma_pte_present(pte)) {
3962 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3963 			free_pgtable_page(pte);
3964 		}
3965 		dmar_domain->agaw--;
3966 	}
3967 
3968 	return 0;
3969 }
3970 
3971 static int intel_iommu_attach_device(struct iommu_domain *domain,
3972 				     struct device *dev)
3973 {
3974 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3975 	int ret;
3976 
3977 	if (info->domain)
3978 		device_block_translation(dev);
3979 
3980 	ret = prepare_domain_attach_device(domain, dev);
3981 	if (ret)
3982 		return ret;
3983 
3984 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3985 }
3986 
3987 static int intel_iommu_map(struct iommu_domain *domain,
3988 			   unsigned long iova, phys_addr_t hpa,
3989 			   size_t size, int iommu_prot, gfp_t gfp)
3990 {
3991 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3992 	u64 max_addr;
3993 	int prot = 0;
3994 
3995 	if (iommu_prot & IOMMU_READ)
3996 		prot |= DMA_PTE_READ;
3997 	if (iommu_prot & IOMMU_WRITE)
3998 		prot |= DMA_PTE_WRITE;
3999 	if (dmar_domain->set_pte_snp)
4000 		prot |= DMA_PTE_SNP;
4001 
4002 	max_addr = iova + size;
4003 	if (dmar_domain->max_addr < max_addr) {
4004 		u64 end;
4005 
4006 		/* check if minimum agaw is sufficient for mapped address */
4007 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4008 		if (end < max_addr) {
4009 			pr_err("%s: iommu width (%d) is not "
4010 			       "sufficient for the mapped address (%llx)\n",
4011 			       __func__, dmar_domain->gaw, max_addr);
4012 			return -EFAULT;
4013 		}
4014 		dmar_domain->max_addr = max_addr;
4015 	}
4016 	/* Round up size to next multiple of PAGE_SIZE, if it and
4017 	   the low bits of hpa would take us onto the next page */
4018 	size = aligned_nrpages(hpa, size);
4019 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4020 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4021 }
4022 
4023 static int intel_iommu_map_pages(struct iommu_domain *domain,
4024 				 unsigned long iova, phys_addr_t paddr,
4025 				 size_t pgsize, size_t pgcount,
4026 				 int prot, gfp_t gfp, size_t *mapped)
4027 {
4028 	unsigned long pgshift = __ffs(pgsize);
4029 	size_t size = pgcount << pgshift;
4030 	int ret;
4031 
4032 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4033 		return -EINVAL;
4034 
4035 	if (!IS_ALIGNED(iova | paddr, pgsize))
4036 		return -EINVAL;
4037 
4038 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4039 	if (!ret && mapped)
4040 		*mapped = size;
4041 
4042 	return ret;
4043 }
4044 
4045 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4046 				unsigned long iova, size_t size,
4047 				struct iommu_iotlb_gather *gather)
4048 {
4049 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4050 	unsigned long start_pfn, last_pfn;
4051 	int level = 0;
4052 
4053 	/* Cope with horrid API which requires us to unmap more than the
4054 	   size argument if it happens to be a large-page mapping. */
4055 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4056 				     &level, GFP_ATOMIC)))
4057 		return 0;
4058 
4059 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4060 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4061 
4062 	start_pfn = iova >> VTD_PAGE_SHIFT;
4063 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4064 
4065 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4066 
4067 	if (dmar_domain->max_addr == iova + size)
4068 		dmar_domain->max_addr = iova;
4069 
4070 	/*
4071 	 * We do not use page-selective IOTLB invalidation in flush queue,
4072 	 * so there is no need to track page and sync iotlb.
4073 	 */
4074 	if (!iommu_iotlb_gather_queued(gather))
4075 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4076 
4077 	return size;
4078 }
4079 
4080 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4081 				      unsigned long iova,
4082 				      size_t pgsize, size_t pgcount,
4083 				      struct iommu_iotlb_gather *gather)
4084 {
4085 	unsigned long pgshift = __ffs(pgsize);
4086 	size_t size = pgcount << pgshift;
4087 
4088 	return intel_iommu_unmap(domain, iova, size, gather);
4089 }
4090 
4091 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4092 				 struct iommu_iotlb_gather *gather)
4093 {
4094 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4095 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4096 	size_t size = gather->end - gather->start;
4097 	struct iommu_domain_info *info;
4098 	unsigned long start_pfn;
4099 	unsigned long nrpages;
4100 	unsigned long i;
4101 
4102 	nrpages = aligned_nrpages(gather->start, size);
4103 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4104 
4105 	xa_for_each(&dmar_domain->iommu_array, i, info)
4106 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4107 				      start_pfn, nrpages,
4108 				      list_empty(&gather->freelist), 0);
4109 
4110 	put_pages_list(&gather->freelist);
4111 }
4112 
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4114 					    dma_addr_t iova)
4115 {
4116 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4117 	struct dma_pte *pte;
4118 	int level = 0;
4119 	u64 phys = 0;
4120 
4121 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4122 			     GFP_ATOMIC);
4123 	if (pte && dma_pte_present(pte))
4124 		phys = dma_pte_addr(pte) +
4125 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4126 						VTD_PAGE_SHIFT) - 1));
4127 
4128 	return phys;
4129 }
4130 
4131 static bool domain_support_force_snooping(struct dmar_domain *domain)
4132 {
4133 	struct device_domain_info *info;
4134 	bool support = true;
4135 
4136 	assert_spin_locked(&domain->lock);
4137 	list_for_each_entry(info, &domain->devices, link) {
4138 		if (!ecap_sc_support(info->iommu->ecap)) {
4139 			support = false;
4140 			break;
4141 		}
4142 	}
4143 
4144 	return support;
4145 }
4146 
4147 static void domain_set_force_snooping(struct dmar_domain *domain)
4148 {
4149 	struct device_domain_info *info;
4150 
4151 	assert_spin_locked(&domain->lock);
4152 	/*
4153 	 * Second level page table supports per-PTE snoop control. The
4154 	 * iommu_map() interface will handle this by setting SNP bit.
4155 	 */
4156 	if (!domain->use_first_level) {
4157 		domain->set_pte_snp = true;
4158 		return;
4159 	}
4160 
4161 	list_for_each_entry(info, &domain->devices, link)
4162 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4163 						     IOMMU_NO_PASID);
4164 }
4165 
4166 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4167 {
4168 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4169 	unsigned long flags;
4170 
4171 	if (dmar_domain->force_snooping)
4172 		return true;
4173 
4174 	spin_lock_irqsave(&dmar_domain->lock, flags);
4175 	if (!domain_support_force_snooping(dmar_domain) ||
4176 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4177 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4178 		return false;
4179 	}
4180 
4181 	domain_set_force_snooping(dmar_domain);
4182 	dmar_domain->force_snooping = true;
4183 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4184 
4185 	return true;
4186 }
4187 
4188 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4189 {
4190 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4191 
4192 	switch (cap) {
4193 	case IOMMU_CAP_CACHE_COHERENCY:
4194 	case IOMMU_CAP_DEFERRED_FLUSH:
4195 		return true;
4196 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4197 		return dmar_platform_optin();
4198 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4199 		return ecap_sc_support(info->iommu->ecap);
4200 	case IOMMU_CAP_DIRTY_TRACKING:
4201 		return ssads_supported(info->iommu);
4202 	default:
4203 		return false;
4204 	}
4205 }
4206 
4207 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4208 {
4209 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4210 	struct device_domain_info *info;
4211 	struct intel_iommu *iommu;
4212 	u8 bus, devfn;
4213 	int ret;
4214 
4215 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4216 	if (!iommu || !iommu->iommu.ops)
4217 		return ERR_PTR(-ENODEV);
4218 
4219 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4220 	if (!info)
4221 		return ERR_PTR(-ENOMEM);
4222 
4223 	if (dev_is_real_dma_subdevice(dev)) {
4224 		info->bus = pdev->bus->number;
4225 		info->devfn = pdev->devfn;
4226 		info->segment = pci_domain_nr(pdev->bus);
4227 	} else {
4228 		info->bus = bus;
4229 		info->devfn = devfn;
4230 		info->segment = iommu->segment;
4231 	}
4232 
4233 	info->dev = dev;
4234 	info->iommu = iommu;
4235 	if (dev_is_pci(dev)) {
4236 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4237 		    pci_ats_supported(pdev) &&
4238 		    dmar_ats_supported(pdev, iommu)) {
4239 			info->ats_supported = 1;
4240 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4241 
4242 			/*
4243 			 * For IOMMU that supports device IOTLB throttling
4244 			 * (DIT), we assign PFSID to the invalidation desc
4245 			 * of a VF such that IOMMU HW can gauge queue depth
4246 			 * at PF level. If DIT is not set, PFSID will be
4247 			 * treated as reserved, which should be set to 0.
4248 			 */
4249 			if (ecap_dit(iommu->ecap))
4250 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4251 			info->ats_qdep = pci_ats_queue_depth(pdev);
4252 		}
4253 		if (sm_supported(iommu)) {
4254 			if (pasid_supported(iommu)) {
4255 				int features = pci_pasid_features(pdev);
4256 
4257 				if (features >= 0)
4258 					info->pasid_supported = features | 1;
4259 			}
4260 
4261 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4262 			    pci_pri_supported(pdev))
4263 				info->pri_supported = 1;
4264 		}
4265 	}
4266 
4267 	dev_iommu_priv_set(dev, info);
4268 
4269 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4270 		ret = intel_pasid_alloc_table(dev);
4271 		if (ret) {
4272 			dev_err(dev, "PASID table allocation failed\n");
4273 			kfree(info);
4274 			return ERR_PTR(ret);
4275 		}
4276 	}
4277 
4278 	intel_iommu_debugfs_create_dev(info);
4279 
4280 	return &iommu->iommu;
4281 }
4282 
4283 static void intel_iommu_release_device(struct device *dev)
4284 {
4285 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4286 
4287 	dmar_remove_one_dev_info(dev);
4288 	intel_pasid_free_table(dev);
4289 	intel_iommu_debugfs_remove_dev(info);
4290 	kfree(info);
4291 	set_dma_ops(dev, NULL);
4292 }
4293 
4294 static void intel_iommu_probe_finalize(struct device *dev)
4295 {
4296 	set_dma_ops(dev, NULL);
4297 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4298 }
4299 
4300 static void intel_iommu_get_resv_regions(struct device *device,
4301 					 struct list_head *head)
4302 {
4303 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4304 	struct iommu_resv_region *reg;
4305 	struct dmar_rmrr_unit *rmrr;
4306 	struct device *i_dev;
4307 	int i;
4308 
4309 	rcu_read_lock();
4310 	for_each_rmrr_units(rmrr) {
4311 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4312 					  i, i_dev) {
4313 			struct iommu_resv_region *resv;
4314 			enum iommu_resv_type type;
4315 			size_t length;
4316 
4317 			if (i_dev != device &&
4318 			    !is_downstream_to_pci_bridge(device, i_dev))
4319 				continue;
4320 
4321 			length = rmrr->end_address - rmrr->base_address + 1;
4322 
4323 			type = device_rmrr_is_relaxable(device) ?
4324 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4325 
4326 			resv = iommu_alloc_resv_region(rmrr->base_address,
4327 						       length, prot, type,
4328 						       GFP_ATOMIC);
4329 			if (!resv)
4330 				break;
4331 
4332 			list_add_tail(&resv->list, head);
4333 		}
4334 	}
4335 	rcu_read_unlock();
4336 
4337 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4338 	if (dev_is_pci(device)) {
4339 		struct pci_dev *pdev = to_pci_dev(device);
4340 
4341 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4342 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4343 					IOMMU_RESV_DIRECT_RELAXABLE,
4344 					GFP_KERNEL);
4345 			if (reg)
4346 				list_add_tail(&reg->list, head);
4347 		}
4348 	}
4349 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4350 
4351 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4352 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4353 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4354 	if (!reg)
4355 		return;
4356 	list_add_tail(&reg->list, head);
4357 }
4358 
/*
 * Pick the IOMMU group for @dev: PCI devices use the PCI grouping helper,
 * everything else gets a generic group.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4365 
4366 static int intel_iommu_enable_sva(struct device *dev)
4367 {
4368 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 	struct intel_iommu *iommu;
4370 
4371 	if (!info || dmar_disabled)
4372 		return -EINVAL;
4373 
4374 	iommu = info->iommu;
4375 	if (!iommu)
4376 		return -EINVAL;
4377 
4378 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4379 		return -ENODEV;
4380 
4381 	if (!info->pasid_enabled || !info->ats_enabled)
4382 		return -EINVAL;
4383 
4384 	/*
4385 	 * Devices having device-specific I/O fault handling should not
4386 	 * support PCI/PRI. The IOMMU side has no means to check the
4387 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4388 	 * default that if the device driver enables SVA on a non-PRI
4389 	 * device, it will handle IOPF in its own way.
4390 	 */
4391 	if (!info->pri_supported)
4392 		return 0;
4393 
4394 	/* Devices supporting PRI should have it enabled. */
4395 	if (!info->pri_enabled)
4396 		return -EINVAL;
4397 
4398 	return 0;
4399 }
4400 
4401 static int intel_iommu_enable_iopf(struct device *dev)
4402 {
4403 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4404 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4405 	struct intel_iommu *iommu;
4406 	int ret;
4407 
4408 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4409 		return -ENODEV;
4410 
4411 	if (info->pri_enabled)
4412 		return -EBUSY;
4413 
4414 	iommu = info->iommu;
4415 	if (!iommu)
4416 		return -EINVAL;
4417 
4418 	/* PASID is required in PRG Response Message. */
4419 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4420 		return -EINVAL;
4421 
4422 	ret = pci_reset_pri(pdev);
4423 	if (ret)
4424 		return ret;
4425 
4426 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4427 	if (ret)
4428 		return ret;
4429 
4430 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4431 	if (ret)
4432 		goto iopf_remove_device;
4433 
4434 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4435 	if (ret)
4436 		goto iopf_unregister_handler;
4437 	info->pri_enabled = 1;
4438 
4439 	return 0;
4440 
4441 iopf_unregister_handler:
4442 	iommu_unregister_device_fault_handler(dev);
4443 iopf_remove_device:
4444 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4445 
4446 	return ret;
4447 }
4448 
4449 static int intel_iommu_disable_iopf(struct device *dev)
4450 {
4451 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4452 	struct intel_iommu *iommu = info->iommu;
4453 
4454 	if (!info->pri_enabled)
4455 		return -EINVAL;
4456 
4457 	/*
4458 	 * PCIe spec states that by clearing PRI enable bit, the Page
4459 	 * Request Interface will not issue new page requests, but has
4460 	 * outstanding page requests that have been transmitted or are
4461 	 * queued for transmission. This is supposed to be called after
4462 	 * the device driver has stopped DMA, all PASIDs have been
4463 	 * unbound and the outstanding PRQs have been drained.
4464 	 */
4465 	pci_disable_pri(to_pci_dev(dev));
4466 	info->pri_enabled = 0;
4467 
4468 	/*
4469 	 * With PRI disabled and outstanding PRQs drained, unregistering
4470 	 * fault handler and removing device from iopf queue should never
4471 	 * fail.
4472 	 */
4473 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4474 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4475 
4476 	return 0;
4477 }
4478 
4479 static int
4480 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4481 {
4482 	switch (feat) {
4483 	case IOMMU_DEV_FEAT_IOPF:
4484 		return intel_iommu_enable_iopf(dev);
4485 
4486 	case IOMMU_DEV_FEAT_SVA:
4487 		return intel_iommu_enable_sva(dev);
4488 
4489 	default:
4490 		return -ENODEV;
4491 	}
4492 }
4493 
4494 static int
4495 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4496 {
4497 	switch (feat) {
4498 	case IOMMU_DEV_FEAT_IOPF:
4499 		return intel_iommu_disable_iopf(dev);
4500 
4501 	case IOMMU_DEV_FEAT_SVA:
4502 		return 0;
4503 
4504 	default:
4505 		return -ENODEV;
4506 	}
4507 }
4508 
4509 static bool intel_iommu_is_attach_deferred(struct device *dev)
4510 {
4511 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4512 
4513 	return translation_pre_enabled(info->iommu) && !info->domain;
4514 }
4515 
4516 /*
4517  * Check that the device does not live on an external facing PCI port that is
4518  * marked as untrusted. Such devices should not be able to apply quirks and
4519  * thus not be able to bypass the IOMMU restrictions.
4520  */
4521 static bool risky_device(struct pci_dev *pdev)
4522 {
4523 	if (pdev->untrusted) {
4524 		pci_info(pdev,
4525 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4526 			 pdev->vendor, pdev->device);
4527 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4528 		return true;
4529 	}
4530 	return false;
4531 }
4532 
4533 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4534 				      unsigned long iova, size_t size)
4535 {
4536 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4537 	unsigned long pages = aligned_nrpages(iova, size);
4538 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4539 	struct iommu_domain_info *info;
4540 	unsigned long i;
4541 
4542 	xa_for_each(&dmar_domain->iommu_array, i, info)
4543 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4544 	return 0;
4545 }
4546 
4547 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4548 {
4549 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4550 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4551 	struct intel_iommu *iommu = info->iommu;
4552 	struct dmar_domain *dmar_domain;
4553 	struct iommu_domain *domain;
4554 	unsigned long flags;
4555 
4556 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4557 	if (WARN_ON_ONCE(!domain))
4558 		goto out_tear_down;
4559 
4560 	/*
4561 	 * The SVA implementation needs to handle its own stuffs like the mm
4562 	 * notification. Before consolidating that code into iommu core, let
4563 	 * the intel sva code handle it.
4564 	 */
4565 	if (domain->type == IOMMU_DOMAIN_SVA) {
4566 		intel_svm_remove_dev_pasid(dev, pasid);
4567 		goto out_tear_down;
4568 	}
4569 
4570 	dmar_domain = to_dmar_domain(domain);
4571 	spin_lock_irqsave(&dmar_domain->lock, flags);
4572 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4573 		if (curr->dev == dev && curr->pasid == pasid) {
4574 			list_del(&curr->link_domain);
4575 			dev_pasid = curr;
4576 			break;
4577 		}
4578 	}
4579 	WARN_ON_ONCE(!dev_pasid);
4580 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4581 
4582 	domain_detach_iommu(dmar_domain, iommu);
4583 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4584 	kfree(dev_pasid);
4585 out_tear_down:
4586 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4587 	intel_drain_pasid_prq(dev, pasid);
4588 }
4589 
4590 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4591 				     struct device *dev, ioasid_t pasid)
4592 {
4593 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4594 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4595 	struct intel_iommu *iommu = info->iommu;
4596 	struct dev_pasid_info *dev_pasid;
4597 	unsigned long flags;
4598 	int ret;
4599 
4600 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4601 		return -EOPNOTSUPP;
4602 
4603 	if (domain->dirty_ops)
4604 		return -EINVAL;
4605 
4606 	if (context_copied(iommu, info->bus, info->devfn))
4607 		return -EBUSY;
4608 
4609 	ret = prepare_domain_attach_device(domain, dev);
4610 	if (ret)
4611 		return ret;
4612 
4613 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4614 	if (!dev_pasid)
4615 		return -ENOMEM;
4616 
4617 	ret = domain_attach_iommu(dmar_domain, iommu);
4618 	if (ret)
4619 		goto out_free;
4620 
4621 	if (domain_type_is_si(dmar_domain))
4622 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4623 	else if (dmar_domain->use_first_level)
4624 		ret = domain_setup_first_level(iommu, dmar_domain,
4625 					       dev, pasid);
4626 	else
4627 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4628 						     dev, pasid);
4629 	if (ret)
4630 		goto out_detach_iommu;
4631 
4632 	dev_pasid->dev = dev;
4633 	dev_pasid->pasid = pasid;
4634 	spin_lock_irqsave(&dmar_domain->lock, flags);
4635 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4636 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4637 
4638 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4639 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4640 
4641 	return 0;
4642 out_detach_iommu:
4643 	domain_detach_iommu(dmar_domain, iommu);
4644 out_free:
4645 	kfree(dev_pasid);
4646 	return ret;
4647 }
4648 
4649 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4650 {
4651 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4652 	struct intel_iommu *iommu = info->iommu;
4653 	struct iommu_hw_info_vtd *vtd;
4654 
4655 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4656 	if (!vtd)
4657 		return ERR_PTR(-ENOMEM);
4658 
4659 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4660 	vtd->cap_reg = iommu->cap;
4661 	vtd->ecap_reg = iommu->ecap;
4662 	*length = sizeof(*vtd);
4663 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4664 	return vtd;
4665 }
4666 
4667 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4668 					  bool enable)
4669 {
4670 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4671 	struct device_domain_info *info;
4672 	int ret;
4673 
4674 	spin_lock(&dmar_domain->lock);
4675 	if (dmar_domain->dirty_tracking == enable)
4676 		goto out_unlock;
4677 
4678 	list_for_each_entry(info, &dmar_domain->devices, link) {
4679 		ret = intel_pasid_setup_dirty_tracking(info->iommu,
4680 						       info->domain, info->dev,
4681 						       IOMMU_NO_PASID, enable);
4682 		if (ret)
4683 			goto err_unwind;
4684 	}
4685 
4686 	dmar_domain->dirty_tracking = enable;
4687 out_unlock:
4688 	spin_unlock(&dmar_domain->lock);
4689 
4690 	return 0;
4691 
4692 err_unwind:
4693 	list_for_each_entry(info, &dmar_domain->devices, link)
4694 		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4695 						 info->dev, IOMMU_NO_PASID,
4696 						 dmar_domain->dirty_tracking);
4697 	spin_unlock(&dmar_domain->lock);
4698 	return ret;
4699 }
4700 
4701 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4702 					    unsigned long iova, size_t size,
4703 					    unsigned long flags,
4704 					    struct iommu_dirty_bitmap *dirty)
4705 {
4706 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4707 	unsigned long end = iova + size - 1;
4708 	unsigned long pgsize;
4709 
4710 	/*
4711 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4712 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4713 	 * have occurred when we stopped dirty tracking. This ensures that we
4714 	 * never inherit dirtied bits from a previous cycle.
4715 	 */
4716 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4717 		return -EINVAL;
4718 
4719 	do {
4720 		struct dma_pte *pte;
4721 		int lvl = 0;
4722 
4723 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4724 				     GFP_ATOMIC);
4725 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4726 		if (!pte || !dma_pte_present(pte)) {
4727 			iova += pgsize;
4728 			continue;
4729 		}
4730 
4731 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4732 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4733 		iova += pgsize;
4734 	} while (iova < end);
4735 
4736 	return 0;
4737 }
4738 
/* Dirty tracking callbacks implemented by this driver. */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4743 
/*
 * IOMMU operations table of the Intel VT-d driver. Device-level callbacks
 * come first; the callbacks used by default for domains are in the nested
 * default_domain_ops table.
 */
const struct iommu_ops intel_iommu_ops = {
	.blocked_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	/* Only the 4K page size is listed in this table. */
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Page response handling is only wired up when SVM is built in. */
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4777 
4778 static void quirk_iommu_igfx(struct pci_dev *dev)
4779 {
4780 	if (risky_device(dev))
4781 		return;
4782 
4783 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4784 	dmar_map_gfx = 0;
4785 }
4786 
4787 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4795 
4796 /* Broadwell igfx malfunctions with dmar */
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4821 
4822 static void quirk_iommu_rwbf(struct pci_dev *dev)
4823 {
4824 	if (risky_device(dev))
4825 		return;
4826 
4827 	/*
4828 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4829 	 * but needs it. Same seems to hold for the desktop versions.
4830 	 */
4831 	pci_info(dev, "Forcing write-buffer flush capability\n");
4832 	rwbf_quirk = 1;
4833 }
4834 
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4842 
4843 #define GGC 0x52
4844 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4845 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4846 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4847 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4848 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4849 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4850 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4851 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4852 
4853 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4854 {
4855 	unsigned short ggc;
4856 
4857 	if (risky_device(dev))
4858 		return;
4859 
4860 	if (pci_read_config_word(dev, GGC, &ggc))
4861 		return;
4862 
4863 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4864 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4865 		dmar_map_gfx = 0;
4866 	} else if (dmar_map_gfx) {
4867 		/* we have to ensure the gfx device is idle before we flush */
4868 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4869 		iommu_set_dma_strict();
4870 	}
4871 }
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4876 
4877 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4878 {
4879 	unsigned short ver;
4880 
4881 	if (!IS_GFX_DEVICE(dev))
4882 		return;
4883 
4884 	ver = (dev->device >> 8) & 0xff;
4885 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4886 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4887 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4888 		return;
4889 
4890 	if (risky_device(dev))
4891 		return;
4892 
4893 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4894 	iommu_skip_te_disable = 1;
4895 }
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4897 
4898 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4899    ISOCH DMAR unit for the Azalia sound device, but not give it any
4900    TLB entries, which causes it to deadlock. Check for that.  We do
4901    this in a function called from init_dmars(), instead of in a PCI
4902    quirk, because we don't want to print the obnoxious "BIOS broken"
4903    message if VT-d is actually disabled.
4904 */
4905 static void __init check_tylersburg_isoch(void)
4906 {
4907 	struct pci_dev *pdev;
4908 	uint32_t vtisochctrl;
4909 
4910 	/* If there's no Azalia in the system anyway, forget it. */
4911 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4912 	if (!pdev)
4913 		return;
4914 
4915 	if (risky_device(pdev)) {
4916 		pci_dev_put(pdev);
4917 		return;
4918 	}
4919 
4920 	pci_dev_put(pdev);
4921 
4922 	/* System Management Registers. Might be hidden, in which case
4923 	   we can't do the sanity check. But that's OK, because the
4924 	   known-broken BIOSes _don't_ actually hide it, so far. */
4925 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4926 	if (!pdev)
4927 		return;
4928 
4929 	if (risky_device(pdev)) {
4930 		pci_dev_put(pdev);
4931 		return;
4932 	}
4933 
4934 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4935 		pci_dev_put(pdev);
4936 		return;
4937 	}
4938 
4939 	pci_dev_put(pdev);
4940 
4941 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4942 	if (vtisochctrl & 1)
4943 		return;
4944 
4945 	/* Drop all bits other than the number of TLB entries */
4946 	vtisochctrl &= 0x1c;
4947 
4948 	/* If we have the recommended number of TLB entries (16), fine. */
4949 	if (vtisochctrl == 0x10)
4950 		return;
4951 
4952 	/* Zero TLB entries? You get to ride the short bus to school. */
4953 	if (!vtisochctrl) {
4954 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4955 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4956 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4957 		     dmi_get_system_info(DMI_BIOS_VERSION),
4958 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4959 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4960 		return;
4961 	}
4962 
4963 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4964 	       vtisochctrl);
4965 }
4966 
4967 /*
4968  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4969  * invalidation completion before posted writes initiated with translated address
4970  * that utilized translations matching the invalidation address range, violating
4971  * the invalidation completion ordering.
4972  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4973  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4974  * under the control of the trusted/privileged host device driver must use this
4975  * quirk.
4976  * Device TLBs are invalidated under the following six conditions:
4977  * 1. Device driver does DMA API unmap IOVA
4978  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4979  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4980  *    exit_mmap() due to crash
4981  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4982  *    VM has to free pages that were unmapped
4983  * 5. Userspace driver unmaps a DMA buffer
4984  * 6. Cache invalidation in vSVA usage (upcoming)
4985  *
4986  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4987  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4988  * invalidate TLB the same way as normal user unmap which will use this quirk.
4989  * The dTLB invalidation after PASID cache flush does not need this quirk.
4990  *
4991  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4992  */
4993 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4994 			       unsigned long address, unsigned long mask,
4995 			       u32 pasid, u16 qdep)
4996 {
4997 	u16 sid;
4998 
4999 	if (likely(!info->dtlb_extra_inval))
5000 		return;
5001 
5002 	sid = PCI_DEVID(info->bus, info->devfn);
5003 	if (pasid == IOMMU_NO_PASID) {
5004 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5005 				   qdep, address, mask);
5006 	} else {
5007 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5008 					 pasid, qdep, address, mask);
5009 	}
5010 }
5011 
5012 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5013 
5014 /*
5015  * Function to submit a command to the enhanced command interface. The
5016  * valid enhanced command descriptions are defined in Table 47 of the
5017  * VT-d spec. The VT-d hardware implementation may support some but not
5018  * all commands, which can be determined by checking the Enhanced
5019  * Command Capability Register.
5020  *
5021  * Return values:
5022  *  - 0: Command successful without any error;
5023  *  - Negative: software error value;
5024  *  - Nonzero positive: failure status code defined in Table 48.
5025  */
5026 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5027 {
5028 	unsigned long flags;
5029 	u64 res;
5030 	int ret;
5031 
5032 	if (!cap_ecmds(iommu->cap))
5033 		return -ENODEV;
5034 
5035 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5036 
5037 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5038 	if (res & DMA_ECMD_ECRSP_IP) {
5039 		ret = -EBUSY;
5040 		goto err;
5041 	}
5042 
5043 	/*
5044 	 * Unconditionally write the operand B, because
5045 	 * - There is no side effect if an ecmd doesn't require an
5046 	 *   operand B, but we set the register to some value.
5047 	 * - It's not invoked in any critical path. The extra MMIO
5048 	 *   write doesn't bring any performance concerns.
5049 	 */
5050 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5051 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5052 
5053 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5054 		      !(res & DMA_ECMD_ECRSP_IP), res);
5055 
5056 	if (res & DMA_ECMD_ECRSP_IP) {
5057 		ret = -ETIMEDOUT;
5058 		goto err;
5059 	}
5060 
5061 	ret = ecmd_get_status_code(res);
5062 err:
5063 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5064 
5065 	return ret;
5066 }
5067