xref: /linux/drivers/iommu/intel/iommu.c (revision 7cc9196675234d4de0e1e19b9da1a8b86ecfeedd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51 
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
55 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
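/*
 * Worked example (illustrative only): with gaw == 48 and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) is
 * (1ULL << 48) - 1; both fit in an unsigned long on 64-bit builds, so
 * DOMAIN_MAX_PFN(48) is not clamped.
 */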
57 
58 /* IO virtual address start page frame number */
59 #define IOVA_START_PFN		(1)
60 
61 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
62 
63 static void __init check_tylersburg_isoch(void);
64 static int rwbf_quirk;
65 
66 /*
67  * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
68  * (used when the kernel is launched with TXT).
69  */
70 static int force_on = 0;
71 static int intel_iommu_tboot_noforce;
72 static int no_platform_optin;
73 
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
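/*
 * With the usual 4KiB VTD_PAGE_SIZE and a 16-byte root_entry (the lo/hi
 * u64 pair used below), this works out to 256 entries, one per PCI bus.
 */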
75 
76 /*
77  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
78  * if marked present.
79  */
80 static phys_addr_t root_entry_lctp(struct root_entry *re)
81 {
82 	if (!(re->lo & 1))
83 		return 0;
84 
85 	return re->lo & VTD_PAGE_MASK;
86 }
87 
88 /*
89  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
90  * if marked present.
91  */
92 static phys_addr_t root_entry_uctp(struct root_entry *re)
93 {
94 	if (!(re->hi & 1))
95 		return 0;
96 
97 	return re->hi & VTD_PAGE_MASK;
98 }
99 
100 /*
101  * This domain is a statically identity mapping domain.
102  *	1. This domain creates a static 1:1 mapping to all usable memory.
103  *	2. It maps to each iommu if successful.
104  *	3. Each iommu maps to this domain if successful.
105  */
106 static struct dmar_domain *si_domain;
107 static int hw_pass_through = 1;
108 
109 struct dmar_rmrr_unit {
110 	struct list_head list;		/* list of rmrr units	*/
111 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
112 	u64	base_address;		/* reserved base address*/
113 	u64	end_address;		/* reserved end address */
114 	struct dmar_dev_scope *devices;	/* target devices */
115 	int	devices_cnt;		/* target device count */
116 };
117 
118 struct dmar_atsr_unit {
119 	struct list_head list;		/* list of ATSR units */
120 	struct acpi_dmar_header *hdr;	/* ACPI header */
121 	struct dmar_dev_scope *devices;	/* target devices */
122 	int devices_cnt;		/* target device count */
123 	u8 include_all:1;		/* include all ports */
124 };
125 
126 struct dmar_satc_unit {
127 	struct list_head list;		/* list of SATC units */
128 	struct acpi_dmar_header *hdr;	/* ACPI header */
129 	struct dmar_dev_scope *devices;	/* target devices */
130 	struct intel_iommu *iommu;	/* the corresponding iommu */
131 	int devices_cnt;		/* target device count */
132 	u8 atc_required:1;		/* ATS is required */
133 };
134 
135 static LIST_HEAD(dmar_atsr_units);
136 static LIST_HEAD(dmar_rmrr_units);
137 static LIST_HEAD(dmar_satc_units);
138 
139 #define for_each_rmrr_units(rmrr) \
140 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
141 
142 static void intel_iommu_domain_free(struct iommu_domain *domain);
143 
144 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
145 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
146 
147 int intel_iommu_enabled = 0;
148 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
149 
150 static int dmar_map_gfx = 1;
151 static int intel_iommu_superpage = 1;
152 static int iommu_identity_mapping;
153 static int iommu_skip_te_disable;
154 
155 #define IDENTMAP_GFX		2
156 #define IDENTMAP_AZALIA		4
157 
158 const struct iommu_ops intel_iommu_ops;
159 static const struct iommu_dirty_ops intel_dirty_ops;
160 
161 static bool translation_pre_enabled(struct intel_iommu *iommu)
162 {
163 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
164 }
165 
166 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
167 {
168 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
169 }
170 
171 static void init_translation_status(struct intel_iommu *iommu)
172 {
173 	u32 gsts;
174 
175 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 	if (gsts & DMA_GSTS_TES)
177 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
178 }
179 
180 static int __init intel_iommu_setup(char *str)
181 {
182 	if (!str)
183 		return -EINVAL;
184 
185 	while (*str) {
186 		if (!strncmp(str, "on", 2)) {
187 			dmar_disabled = 0;
188 			pr_info("IOMMU enabled\n");
189 		} else if (!strncmp(str, "off", 3)) {
190 			dmar_disabled = 1;
191 			no_platform_optin = 1;
192 			pr_info("IOMMU disabled\n");
193 		} else if (!strncmp(str, "igfx_off", 8)) {
194 			dmar_map_gfx = 0;
195 			pr_info("Disable GFX device mapping\n");
196 		} else if (!strncmp(str, "forcedac", 8)) {
197 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
198 			iommu_dma_forcedac = true;
199 		} else if (!strncmp(str, "strict", 6)) {
200 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
201 			iommu_set_dma_strict();
202 		} else if (!strncmp(str, "sp_off", 6)) {
203 			pr_info("Disable supported super page\n");
204 			intel_iommu_superpage = 0;
205 		} else if (!strncmp(str, "sm_on", 5)) {
206 			pr_info("Enable scalable mode if hardware supports\n");
207 			intel_iommu_sm = 1;
208 		} else if (!strncmp(str, "sm_off", 6)) {
209 			pr_info("Scalable mode is disallowed\n");
210 			intel_iommu_sm = 0;
211 		} else if (!strncmp(str, "tboot_noforce", 13)) {
212 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
213 			intel_iommu_tboot_noforce = 1;
214 		} else {
215 			pr_notice("Unknown option - '%s'\n", str);
216 		}
217 
218 		str += strcspn(str, ",");
219 		while (*str == ',')
220 			str++;
221 	}
222 
223 	return 1;
224 }
225 __setup("intel_iommu=", intel_iommu_setup);
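/*
 * The parser above splits the option string on commas, so several options
 * may be combined on the kernel command line, for example (hypothetical):
 *
 *	intel_iommu=on,sm_on,sp_off
 */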
226 
227 void *alloc_pgtable_page(int node, gfp_t gfp)
228 {
229 	struct page *page;
230 	void *vaddr = NULL;
231 
232 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
233 	if (page)
234 		vaddr = page_address(page);
235 	return vaddr;
236 }
237 
238 void free_pgtable_page(void *vaddr)
239 {
240 	free_page((unsigned long)vaddr);
241 }
242 
243 static int domain_type_is_si(struct dmar_domain *domain)
244 {
245 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
246 }
247 
248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
249 {
250 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
251 
252 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
253 }
254 
255 /*
256  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258  * the returned SAGAW.
259  */
260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
261 {
262 	unsigned long fl_sagaw, sl_sagaw;
263 
264 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 	sl_sagaw = cap_sagaw(iommu->cap);
266 
267 	/* Second level only. */
268 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
269 		return sl_sagaw;
270 
271 	/* First level only. */
272 	if (!ecap_slts(iommu->ecap))
273 		return fl_sagaw;
274 
275 	return fl_sagaw & sl_sagaw;
276 }
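/*
 * Illustrative example: in the SAGAW encoding, BIT(2) stands for 48-bit
 * (4-level) and BIT(3) for 57-bit (5-level) page tables. A scalable-mode
 * IOMMU supporting both FLTS and SLTS, with 5-level first-level support
 * and cap_sagaw() == BIT(2), yields fl_sagaw = BIT(2) | BIT(3),
 * sl_sagaw = BIT(2), and a resulting SAGAW of BIT(2).
 */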
277 
278 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
279 {
280 	unsigned long sagaw;
281 	int agaw;
282 
283 	sagaw = __iommu_calculate_sagaw(iommu);
284 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
285 		if (test_bit(agaw, &sagaw))
286 			break;
287 	}
288 
289 	return agaw;
290 }
291 
292 /*
293  * Calculate max SAGAW for each iommu.
294  */
295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
296 {
297 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
298 }
299 
300 /*
301  * Calculate the agaw for each iommu.
302  * "SAGAW" may differ across iommus, so use a default agaw and fall back
303  * to a smaller supported agaw for iommus that don't support the default.
304  */
305 int iommu_calculate_agaw(struct intel_iommu *iommu)
306 {
307 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
308 }
309 
310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
311 {
312 	return sm_supported(iommu) ?
313 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
314 }
315 
316 static void domain_update_iommu_coherency(struct dmar_domain *domain)
317 {
318 	struct iommu_domain_info *info;
319 	struct dmar_drhd_unit *drhd;
320 	struct intel_iommu *iommu;
321 	bool found = false;
322 	unsigned long i;
323 
324 	domain->iommu_coherency = true;
325 	xa_for_each(&domain->iommu_array, i, info) {
326 		found = true;
327 		if (!iommu_paging_structure_coherency(info->iommu)) {
328 			domain->iommu_coherency = false;
329 			break;
330 		}
331 	}
332 	if (found)
333 		return;
334 
335 	/* No hardware attached; use lowest common denominator */
336 	rcu_read_lock();
337 	for_each_active_iommu(iommu, drhd) {
338 		if (!iommu_paging_structure_coherency(iommu)) {
339 			domain->iommu_coherency = false;
340 			break;
341 		}
342 	}
343 	rcu_read_unlock();
344 }
345 
346 static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 					 struct intel_iommu *skip)
348 {
349 	struct dmar_drhd_unit *drhd;
350 	struct intel_iommu *iommu;
351 	int mask = 0x3;
352 
353 	if (!intel_iommu_superpage)
354 		return 0;
355 
356 	/* set iommu_superpage to the smallest common denominator */
357 	rcu_read_lock();
358 	for_each_active_iommu(iommu, drhd) {
359 		if (iommu != skip) {
360 			if (domain && domain->use_first_level) {
361 				if (!cap_fl1gp_support(iommu->cap))
362 					mask = 0x1;
363 			} else {
364 				mask &= cap_super_page_val(iommu->cap);
365 			}
366 
367 			if (!mask)
368 				break;
369 		}
370 	}
371 	rcu_read_unlock();
372 
373 	return fls(mask);
374 }
375 
376 static int domain_update_device_node(struct dmar_domain *domain)
377 {
378 	struct device_domain_info *info;
379 	int nid = NUMA_NO_NODE;
380 	unsigned long flags;
381 
382 	spin_lock_irqsave(&domain->lock, flags);
383 	list_for_each_entry(info, &domain->devices, link) {
384 		/*
385 		 * There could be multiple device NUMA nodes, as devices within
386 		 * the same domain may sit behind different IOMMUs. There is no
387 		 * perfect answer in such a situation, so we use a first-come,
388 		 * first-served policy.
389 		 */
390 		nid = dev_to_node(info->dev);
391 		if (nid != NUMA_NO_NODE)
392 			break;
393 	}
394 	spin_unlock_irqrestore(&domain->lock, flags);
395 
396 	return nid;
397 }
398 
399 /* Return the super pagesize bitmap if supported. */
400 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
401 {
402 	unsigned long bitmap = 0;
403 
404 	/*
405 	 * 1-level super page supports page size of 2MiB, 2-level super page
406 	 * supports page size of both 2MiB and 1GiB.
407 	 */
408 	if (domain->iommu_superpage == 1)
409 		bitmap |= SZ_2M;
410 	else if (domain->iommu_superpage == 2)
411 		bitmap |= SZ_2M | SZ_1G;
412 
413 	return bitmap;
414 }
415 
416 /* Some capabilities may be different across iommus */
417 void domain_update_iommu_cap(struct dmar_domain *domain)
418 {
419 	domain_update_iommu_coherency(domain);
420 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
421 
422 	/*
423 	 * If RHSA is missing, we should default to the device numa domain
424 	 * as fall back.
425 	 */
426 	if (domain->nid == NUMA_NO_NODE)
427 		domain->nid = domain_update_device_node(domain);
428 
429 	/*
430 	 * First-level translation restricts the input-address to a
431 	 * canonical address (i.e., address bits 63:N have the same
432 	 * value as address bit [N-1], where N is 48-bits with 4-level
433 	 * paging and 57-bits with 5-level paging). Hence, skip bit
434 	 * [N-1].
435 	 */
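	/* For example, with gaw == 48 the first-level aperture ends at 2^47 - 1. */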
436 	if (domain->use_first_level)
437 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
438 	else
439 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
440 
441 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
442 	domain_update_iotlb(domain);
443 }
444 
445 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
446 					 u8 devfn, int alloc)
447 {
448 	struct root_entry *root = &iommu->root_entry[bus];
449 	struct context_entry *context;
450 	u64 *entry;
451 
452 	/*
453 	 * Unless the caller requests allocation of a new entry, returning
454 	 * a copied context entry makes no sense.
455 	 */
456 	if (!alloc && context_copied(iommu, bus, devfn))
457 		return NULL;
458 
459 	entry = &root->lo;
460 	if (sm_supported(iommu)) {
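		/*
		 * In scalable mode a context entry is 256 bits wide (two
		 * struct context_entry slots), so one 4KiB table covers
		 * only 128 devfns: root->lo points at the table for devfn
		 * 0x00-0x7f and root->hi at the one for devfn 0x80-0xff,
		 * hence the rebasing and doubling below.
		 */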
461 		if (devfn >= 0x80) {
462 			devfn -= 0x80;
463 			entry = &root->hi;
464 		}
465 		devfn *= 2;
466 	}
467 	if (*entry & 1)
468 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
469 	else {
470 		unsigned long phy_addr;
471 		if (!alloc)
472 			return NULL;
473 
474 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
475 		if (!context)
476 			return NULL;
477 
478 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
479 		phy_addr = virt_to_phys((void *)context);
480 		*entry = phy_addr | 1;
481 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
482 	}
483 	return &context[devfn];
484 }
485 
486 /**
487  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
488  *				 sub-hierarchy of a candidate PCI-PCI bridge
489  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
490  * @bridge: the candidate PCI-PCI bridge
491  *
492  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
493  */
494 static bool
495 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
496 {
497 	struct pci_dev *pdev, *pbridge;
498 
499 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
500 		return false;
501 
502 	pdev = to_pci_dev(dev);
503 	pbridge = to_pci_dev(bridge);
504 
505 	if (pbridge->subordinate &&
506 	    pbridge->subordinate->number <= pdev->bus->number &&
507 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
508 		return true;
509 
510 	return false;
511 }
512 
513 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
514 {
515 	struct dmar_drhd_unit *drhd;
516 	u32 vtbar;
517 	int rc;
518 
519 	/* We know that this device on this chipset has its own IOMMU.
520 	 * If we find it under a different IOMMU, then the BIOS is lying
521 	 * to us. Hope that the IOMMU for this device is actually
522 	 * disabled, and it needs no translation...
523 	 */
524 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
525 	if (rc) {
526 		/* "can't" happen */
527 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
528 		return false;
529 	}
530 	vtbar &= 0xffff0000;
531 
532 	/* we know that this iommu should be at offset 0xa000 from vtbar */
533 	drhd = dmar_find_matched_drhd_unit(pdev);
534 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
535 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
536 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
537 		return true;
538 	}
539 
540 	return false;
541 }
542 
543 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
544 {
545 	if (!iommu || iommu->drhd->ignored)
546 		return true;
547 
548 	if (dev_is_pci(dev)) {
549 		struct pci_dev *pdev = to_pci_dev(dev);
550 
551 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
552 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
553 		    quirk_ioat_snb_local_iommu(pdev))
554 			return true;
555 	}
556 
557 	return false;
558 }
559 
560 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
561 {
562 	struct dmar_drhd_unit *drhd = NULL;
563 	struct pci_dev *pdev = NULL;
564 	struct intel_iommu *iommu;
565 	struct device *tmp;
566 	u16 segment = 0;
567 	int i;
568 
569 	if (!dev)
570 		return NULL;
571 
572 	if (dev_is_pci(dev)) {
573 		struct pci_dev *pf_pdev;
574 
575 		pdev = pci_real_dma_dev(to_pci_dev(dev));
576 
577 		/* VFs aren't listed in scope tables; we need to look up
578 		 * the PF instead to find the IOMMU. */
579 		pf_pdev = pci_physfn(pdev);
580 		dev = &pf_pdev->dev;
581 		segment = pci_domain_nr(pdev->bus);
582 	} else if (has_acpi_companion(dev))
583 		dev = &ACPI_COMPANION(dev)->dev;
584 
585 	rcu_read_lock();
586 	for_each_iommu(iommu, drhd) {
587 		if (pdev && segment != drhd->segment)
588 			continue;
589 
590 		for_each_active_dev_scope(drhd->devices,
591 					  drhd->devices_cnt, i, tmp) {
592 			if (tmp == dev) {
593 				/* For a VF use its original BDF# not that of the PF
594 				 * which we used for the IOMMU lookup. Strictly speaking
595 				 * we could do this for all PCI devices; we only need to
596 				 * get the BDF# from the scope table for ACPI matches. */
597 				if (pdev && pdev->is_virtfn)
598 					goto got_pdev;
599 
600 				if (bus && devfn) {
601 					*bus = drhd->devices[i].bus;
602 					*devfn = drhd->devices[i].devfn;
603 				}
604 				goto out;
605 			}
606 
607 			if (is_downstream_to_pci_bridge(dev, tmp))
608 				goto got_pdev;
609 		}
610 
611 		if (pdev && drhd->include_all) {
612 got_pdev:
613 			if (bus && devfn) {
614 				*bus = pdev->bus->number;
615 				*devfn = pdev->devfn;
616 			}
617 			goto out;
618 		}
619 	}
620 	iommu = NULL;
621 out:
622 	if (iommu_is_dummy(iommu, dev))
623 		iommu = NULL;
624 
625 	rcu_read_unlock();
626 
627 	return iommu;
628 }
629 
630 static void domain_flush_cache(struct dmar_domain *domain,
631 			       void *addr, int size)
632 {
633 	if (!domain->iommu_coherency)
634 		clflush_cache_range(addr, size);
635 }
636 
637 static void free_context_table(struct intel_iommu *iommu)
638 {
639 	struct context_entry *context;
640 	int i;
641 
642 	if (!iommu->root_entry)
643 		return;
644 
645 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
646 		context = iommu_context_addr(iommu, i, 0, 0);
647 		if (context)
648 			free_pgtable_page(context);
649 
650 		if (!sm_supported(iommu))
651 			continue;
652 
653 		context = iommu_context_addr(iommu, i, 0x80, 0);
654 		if (context)
655 			free_pgtable_page(context);
656 	}
657 
658 	free_pgtable_page(iommu->root_entry);
659 	iommu->root_entry = NULL;
660 }
661 
662 #ifdef CONFIG_DMAR_DEBUG
663 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
664 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
665 {
666 	struct dma_pte *pte;
667 	int offset;
668 
669 	while (1) {
670 		offset = pfn_level_offset(pfn, level);
671 		pte = &parent[offset];
672 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
673 			pr_info("PTE not present at level %d\n", level);
674 			break;
675 		}
676 
677 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
678 
679 		if (level == 1)
680 			break;
681 
682 		parent = phys_to_virt(dma_pte_addr(pte));
683 		level--;
684 	}
685 }
686 
687 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
688 			  unsigned long long addr, u32 pasid)
689 {
690 	struct pasid_dir_entry *dir, *pde;
691 	struct pasid_entry *entries, *pte;
692 	struct context_entry *ctx_entry;
693 	struct root_entry *rt_entry;
694 	int i, dir_index, index, level;
695 	u8 devfn = source_id & 0xff;
696 	u8 bus = source_id >> 8;
697 	struct dma_pte *pgtable;
698 
699 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
700 
701 	/* root entry dump */
702 	rt_entry = &iommu->root_entry[bus];
703 	if (!rt_entry) {
704 		pr_info("root table entry is not present\n");
705 		return;
706 	}
707 
708 	if (sm_supported(iommu))
709 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
710 			rt_entry->hi, rt_entry->lo);
711 	else
712 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
713 
714 	/* context entry dump */
715 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
716 	if (!ctx_entry) {
717 		pr_info("context table entry is not present\n");
718 		return;
719 	}
720 
721 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
722 		ctx_entry->hi, ctx_entry->lo);
723 
724 	/* legacy mode does not require PASID entries */
725 	if (!sm_supported(iommu)) {
726 		level = agaw_to_level(ctx_entry->hi & 7);
727 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
728 		goto pgtable_walk;
729 	}
730 
731 	/* get the pointer to pasid directory entry */
732 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
733 	if (!dir) {
734 		pr_info("pasid directory entry is not present\n");
735 		return;
736 	}
737 	/* For request-without-pasid, get the pasid from context entry */
738 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
739 		pasid = IOMMU_NO_PASID;
740 
741 	dir_index = pasid >> PASID_PDE_SHIFT;
742 	pde = &dir[dir_index];
743 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
744 
745 	/* get the pointer to the pasid table entry */
746 	entries = get_pasid_table_from_pde(pde);
747 	if (!entries) {
748 		pr_info("pasid table entry is not present\n");
749 		return;
750 	}
751 	index = pasid & PASID_PTE_MASK;
752 	pte = &entries[index];
753 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
754 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
755 
756 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
757 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
758 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
759 	} else {
760 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
761 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
762 	}
763 
764 pgtable_walk:
765 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
766 }
767 #endif
768 
769 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
770 				      unsigned long pfn, int *target_level,
771 				      gfp_t gfp)
772 {
773 	struct dma_pte *parent, *pte;
774 	int level = agaw_to_level(domain->agaw);
775 	int offset;
776 
777 	if (!domain_pfn_supported(domain, pfn))
778 		/* Address beyond IOMMU's addressing capabilities. */
779 		return NULL;
780 
781 	parent = domain->pgd;
782 
783 	while (1) {
784 		void *tmp_page;
785 
786 		offset = pfn_level_offset(pfn, level);
787 		pte = &parent[offset];
788 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789 			break;
790 		if (level == *target_level)
791 			break;
792 
793 		if (!dma_pte_present(pte)) {
794 			uint64_t pteval;
795 
796 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
797 
798 			if (!tmp_page)
799 				return NULL;
800 
801 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803 			if (domain->use_first_level)
804 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
805 
806 			if (cmpxchg64(&pte->val, 0ULL, pteval))
807 				/* Someone else set it while we were thinking; use theirs. */
808 				free_pgtable_page(tmp_page);
809 			else
810 				domain_flush_cache(domain, pte, sizeof(*pte));
811 		}
812 		if (level == 1)
813 			break;
814 
815 		parent = phys_to_virt(dma_pte_addr(pte));
816 		level--;
817 	}
818 
819 	if (!*target_level)
820 		*target_level = level;
821 
822 	return pte;
823 }
824 
825 /* return the address's pte at a specific level */
826 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
827 					 unsigned long pfn,
828 					 int level, int *large_page)
829 {
830 	struct dma_pte *parent, *pte;
831 	int total = agaw_to_level(domain->agaw);
832 	int offset;
833 
834 	parent = domain->pgd;
835 	while (level <= total) {
836 		offset = pfn_level_offset(pfn, total);
837 		pte = &parent[offset];
838 		if (level == total)
839 			return pte;
840 
841 		if (!dma_pte_present(pte)) {
842 			*large_page = total;
843 			break;
844 		}
845 
846 		if (dma_pte_superpage(pte)) {
847 			*large_page = total;
848 			return pte;
849 		}
850 
851 		parent = phys_to_virt(dma_pte_addr(pte));
852 		total--;
853 	}
854 	return NULL;
855 }
856 
857 /* clear last level pte; a tlb flush should follow */
858 static void dma_pte_clear_range(struct dmar_domain *domain,
859 				unsigned long start_pfn,
860 				unsigned long last_pfn)
861 {
862 	unsigned int large_page;
863 	struct dma_pte *first_pte, *pte;
864 
865 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
866 	    WARN_ON(start_pfn > last_pfn))
867 		return;
868 
869 	/* we don't need lock here; nobody else touches the iova range */
870 	do {
871 		large_page = 1;
872 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
873 		if (!pte) {
874 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
875 			continue;
876 		}
877 		do {
878 			dma_clear_pte(pte);
879 			start_pfn += lvl_to_nr_pages(large_page);
880 			pte++;
881 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
882 
883 		domain_flush_cache(domain, first_pte,
884 				   (void *)pte - (void *)first_pte);
885 
886 	} while (start_pfn && start_pfn <= last_pfn);
887 }
888 
889 static void dma_pte_free_level(struct dmar_domain *domain, int level,
890 			       int retain_level, struct dma_pte *pte,
891 			       unsigned long pfn, unsigned long start_pfn,
892 			       unsigned long last_pfn)
893 {
894 	pfn = max(start_pfn, pfn);
895 	pte = &pte[pfn_level_offset(pfn, level)];
896 
897 	do {
898 		unsigned long level_pfn;
899 		struct dma_pte *level_pte;
900 
901 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
902 			goto next;
903 
904 		level_pfn = pfn & level_mask(level);
905 		level_pte = phys_to_virt(dma_pte_addr(pte));
906 
907 		if (level > 2) {
908 			dma_pte_free_level(domain, level - 1, retain_level,
909 					   level_pte, level_pfn, start_pfn,
910 					   last_pfn);
911 		}
912 
913 		/*
914 		 * Free the page table if we're below the level we want to
915 		 * retain and the range covers the entire table.
916 		 */
917 		if (level < retain_level && !(start_pfn > level_pfn ||
918 		      last_pfn < level_pfn + level_size(level) - 1)) {
919 			dma_clear_pte(pte);
920 			domain_flush_cache(domain, pte, sizeof(*pte));
921 			free_pgtable_page(level_pte);
922 		}
923 next:
924 		pfn += level_size(level);
925 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
926 }
927 
928 /*
929  * clear last level (leaf) ptes and free page table pages below the
930  * level we wish to keep intact.
931  */
932 static void dma_pte_free_pagetable(struct dmar_domain *domain,
933 				   unsigned long start_pfn,
934 				   unsigned long last_pfn,
935 				   int retain_level)
936 {
937 	dma_pte_clear_range(domain, start_pfn, last_pfn);
938 
939 	/* We don't need lock here; nobody else touches the iova range */
940 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
941 			   domain->pgd, 0, start_pfn, last_pfn);
942 
943 	/* free pgd */
944 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
945 		free_pgtable_page(domain->pgd);
946 		domain->pgd = NULL;
947 	}
948 }
949 
950 /* When a page at a given level is being unlinked from its parent, we don't
951    need to *modify* it at all. All we need to do is make a list of all the
952    pages which can be freed just as soon as we've flushed the IOTLB and we
953    know the hardware page-walk will no longer touch them.
954    The 'pte' argument is the *parent* PTE, pointing to the page that is to
955    be freed. */
956 static void dma_pte_list_pagetables(struct dmar_domain *domain,
957 				    int level, struct dma_pte *pte,
958 				    struct list_head *freelist)
959 {
960 	struct page *pg;
961 
962 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
963 	list_add_tail(&pg->lru, freelist);
964 
965 	if (level == 1)
966 		return;
967 
968 	pte = page_address(pg);
969 	do {
970 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
971 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
972 		pte++;
973 	} while (!first_pte_in_page(pte));
974 }
975 
976 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
977 				struct dma_pte *pte, unsigned long pfn,
978 				unsigned long start_pfn, unsigned long last_pfn,
979 				struct list_head *freelist)
980 {
981 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
982 
983 	pfn = max(start_pfn, pfn);
984 	pte = &pte[pfn_level_offset(pfn, level)];
985 
986 	do {
987 		unsigned long level_pfn = pfn & level_mask(level);
988 
989 		if (!dma_pte_present(pte))
990 			goto next;
991 
992 		/* If range covers entire pagetable, free it */
993 		if (start_pfn <= level_pfn &&
994 		    last_pfn >= level_pfn + level_size(level) - 1) {
995 			/* These subordinate page tables are going away entirely. Don't
996 			   bother to clear them; we're just going to *free* them. */
997 			if (level > 1 && !dma_pte_superpage(pte))
998 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
999 
1000 			dma_clear_pte(pte);
1001 			if (!first_pte)
1002 				first_pte = pte;
1003 			last_pte = pte;
1004 		} else if (level > 1) {
1005 			/* Recurse down into a level that isn't *entirely* obsolete */
1006 			dma_pte_clear_level(domain, level - 1,
1007 					    phys_to_virt(dma_pte_addr(pte)),
1008 					    level_pfn, start_pfn, last_pfn,
1009 					    freelist);
1010 		}
1011 next:
1012 		pfn = level_pfn + level_size(level);
1013 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1014 
1015 	if (first_pte)
1016 		domain_flush_cache(domain, first_pte,
1017 				   (void *)++last_pte - (void *)first_pte);
1018 }
1019 
1020 /* We can't just free the pages because the IOMMU may still be walking
1021    the page tables, and may have cached the intermediate levels. The
1022    pages can only be freed after the IOTLB flush has been done. */
1023 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1024 			 unsigned long last_pfn, struct list_head *freelist)
1025 {
1026 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1027 	    WARN_ON(start_pfn > last_pfn))
1028 		return;
1029 
1030 	/* we don't need lock here; nobody else touches the iova range */
1031 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1032 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1033 
1034 	/* free pgd */
1035 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1036 		struct page *pgd_page = virt_to_page(domain->pgd);
1037 		list_add_tail(&pgd_page->lru, freelist);
1038 		domain->pgd = NULL;
1039 	}
1040 }
1041 
1042 /* iommu handling */
1043 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1044 {
1045 	struct root_entry *root;
1046 
1047 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1048 	if (!root) {
1049 		pr_err("Allocating root entry for %s failed\n",
1050 			iommu->name);
1051 		return -ENOMEM;
1052 	}
1053 
1054 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1055 	iommu->root_entry = root;
1056 
1057 	return 0;
1058 }
1059 
1060 static void iommu_set_root_entry(struct intel_iommu *iommu)
1061 {
1062 	u64 addr;
1063 	u32 sts;
1064 	unsigned long flag;
1065 
1066 	addr = virt_to_phys(iommu->root_entry);
1067 	if (sm_supported(iommu))
1068 		addr |= DMA_RTADDR_SMT;
1069 
1070 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1071 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1072 
1073 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1074 
1075 	/* Make sure hardware completes it */
1076 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1077 		      readl, (sts & DMA_GSTS_RTPS), sts);
1078 
1079 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1080 
1081 	/*
1082 	 * Hardware invalidates all DMA remapping hardware translation
1083 	 * caches as part of SRTP flow.
1084 	 */
1085 	if (cap_esrtps(iommu->cap))
1086 		return;
1087 
1088 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1089 	if (sm_supported(iommu))
1090 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1091 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1092 }
1093 
1094 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1095 {
1096 	u32 val;
1097 	unsigned long flag;
1098 
1099 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1100 		return;
1101 
1102 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1103 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1104 
1105 	/* Make sure hardware completes it */
1106 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1107 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1108 
1109 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1110 }
1111 
1112 /* The return value determines whether we need a write buffer flush */
1113 static void __iommu_flush_context(struct intel_iommu *iommu,
1114 				  u16 did, u16 source_id, u8 function_mask,
1115 				  u64 type)
1116 {
1117 	u64 val = 0;
1118 	unsigned long flag;
1119 
1120 	switch (type) {
1121 	case DMA_CCMD_GLOBAL_INVL:
1122 		val = DMA_CCMD_GLOBAL_INVL;
1123 		break;
1124 	case DMA_CCMD_DOMAIN_INVL:
1125 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1126 		break;
1127 	case DMA_CCMD_DEVICE_INVL:
1128 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1129 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1130 		break;
1131 	default:
1132 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1133 			iommu->name, type);
1134 		return;
1135 	}
1136 	val |= DMA_CCMD_ICC;
1137 
1138 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1139 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1140 
1141 	/* Make sure hardware completes it */
1142 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1143 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1144 
1145 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1146 }
1147 
1148 /* The return value determines whether we need a write buffer flush */
1149 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1150 				u64 addr, unsigned int size_order, u64 type)
1151 {
1152 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1153 	u64 val = 0, val_iva = 0;
1154 	unsigned long flag;
1155 
1156 	switch (type) {
1157 	case DMA_TLB_GLOBAL_FLUSH:
1158 		/* global flush doesn't need to set IVA_REG */
1159 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1160 		break;
1161 	case DMA_TLB_DSI_FLUSH:
1162 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1163 		break;
1164 	case DMA_TLB_PSI_FLUSH:
1165 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1166 		/* IH bit is passed in as part of address */
1167 		val_iva = size_order | addr;
1168 		break;
1169 	default:
1170 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1171 			iommu->name, type);
1172 		return;
1173 	}
1174 
1175 	if (cap_write_drain(iommu->cap))
1176 		val |= DMA_TLB_WRITE_DRAIN;
1177 
1178 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1179 	/* Note: Only uses first TLB reg currently */
1180 	if (val_iva)
1181 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1182 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1183 
1184 	/* Make sure hardware completes it */
1185 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1186 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1187 
1188 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1189 
1190 	/* check IOTLB invalidation granularity */
1191 	if (DMA_TLB_IAIG(val) == 0)
1192 		pr_err("Flush IOTLB failed\n");
1193 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1194 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1195 			(unsigned long long)DMA_TLB_IIRG(type),
1196 			(unsigned long long)DMA_TLB_IAIG(val));
1197 }
1198 
1199 static struct device_domain_info *
1200 domain_lookup_dev_info(struct dmar_domain *domain,
1201 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1202 {
1203 	struct device_domain_info *info;
1204 	unsigned long flags;
1205 
1206 	spin_lock_irqsave(&domain->lock, flags);
1207 	list_for_each_entry(info, &domain->devices, link) {
1208 		if (info->iommu == iommu && info->bus == bus &&
1209 		    info->devfn == devfn) {
1210 			spin_unlock_irqrestore(&domain->lock, flags);
1211 			return info;
1212 		}
1213 	}
1214 	spin_unlock_irqrestore(&domain->lock, flags);
1215 
1216 	return NULL;
1217 }
1218 
1219 void domain_update_iotlb(struct dmar_domain *domain)
1220 {
1221 	struct dev_pasid_info *dev_pasid;
1222 	struct device_domain_info *info;
1223 	bool has_iotlb_device = false;
1224 	unsigned long flags;
1225 
1226 	spin_lock_irqsave(&domain->lock, flags);
1227 	list_for_each_entry(info, &domain->devices, link) {
1228 		if (info->ats_enabled) {
1229 			has_iotlb_device = true;
1230 			break;
1231 		}
1232 	}
1233 
1234 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1235 		info = dev_iommu_priv_get(dev_pasid->dev);
1236 		if (info->ats_enabled) {
1237 			has_iotlb_device = true;
1238 			break;
1239 		}
1240 	}
1241 	domain->has_iotlb_device = has_iotlb_device;
1242 	spin_unlock_irqrestore(&domain->lock, flags);
1243 }
1244 
1245 /*
1246  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1247  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1248  * check because it applies only to the built-in QAT devices and it doesn't
1249  * grant additional privileges.
1250  */
1251 #define BUGGY_QAT_DEVID_MASK 0x4940
1252 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1253 {
1254 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1255 		return false;
1256 
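	/* Masking off the low two bits matches device IDs 0x4940-0x4943. */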
1257 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1258 		return false;
1259 
1260 	return true;
1261 }
1262 
1263 static void iommu_enable_pci_caps(struct device_domain_info *info)
1264 {
1265 	struct pci_dev *pdev;
1266 
1267 	if (!dev_is_pci(info->dev))
1268 		return;
1269 
1270 	pdev = to_pci_dev(info->dev);
1271 
1272 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1273 	   the device if you enable PASID support after ATS support is
1274 	   undefined. So always enable PASID support on devices which
1275 	   have it, even if we can't yet know if we're ever going to
1276 	   use it. */
1277 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1278 		info->pasid_enabled = 1;
1279 
1280 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1281 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1282 		info->ats_enabled = 1;
1283 		domain_update_iotlb(info->domain);
1284 	}
1285 }
1286 
1287 static void iommu_disable_pci_caps(struct device_domain_info *info)
1288 {
1289 	struct pci_dev *pdev;
1290 
1291 	if (!dev_is_pci(info->dev))
1292 		return;
1293 
1294 	pdev = to_pci_dev(info->dev);
1295 
1296 	if (info->ats_enabled) {
1297 		pci_disable_ats(pdev);
1298 		info->ats_enabled = 0;
1299 		domain_update_iotlb(info->domain);
1300 	}
1301 
1302 	if (info->pasid_enabled) {
1303 		pci_disable_pasid(pdev);
1304 		info->pasid_enabled = 0;
1305 	}
1306 }
1307 
1308 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1309 				    u64 addr, unsigned int mask)
1310 {
1311 	u16 sid, qdep;
1312 
1313 	if (!info || !info->ats_enabled)
1314 		return;
1315 
1316 	sid = info->bus << 8 | info->devfn;
1317 	qdep = info->ats_qdep;
1318 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1319 			   qdep, addr, mask);
1320 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1321 }
1322 
1323 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1324 				  u64 addr, unsigned mask)
1325 {
1326 	struct dev_pasid_info *dev_pasid;
1327 	struct device_domain_info *info;
1328 	unsigned long flags;
1329 
1330 	if (!domain->has_iotlb_device)
1331 		return;
1332 
1333 	spin_lock_irqsave(&domain->lock, flags);
1334 	list_for_each_entry(info, &domain->devices, link)
1335 		__iommu_flush_dev_iotlb(info, addr, mask);
1336 
1337 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1338 		info = dev_iommu_priv_get(dev_pasid->dev);
1339 
1340 		if (!info->ats_enabled)
1341 			continue;
1342 
1343 		qi_flush_dev_iotlb_pasid(info->iommu,
1344 					 PCI_DEVID(info->bus, info->devfn),
1345 					 info->pfsid, dev_pasid->pasid,
1346 					 info->ats_qdep, addr,
1347 					 mask);
1348 	}
1349 	spin_unlock_irqrestore(&domain->lock, flags);
1350 }
1351 
1352 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1353 				     struct dmar_domain *domain, u64 addr,
1354 				     unsigned long npages, bool ih)
1355 {
1356 	u16 did = domain_id_iommu(domain, iommu);
1357 	struct dev_pasid_info *dev_pasid;
1358 	unsigned long flags;
1359 
1360 	spin_lock_irqsave(&domain->lock, flags);
1361 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1362 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1363 
1364 	if (!list_empty(&domain->devices))
1365 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1366 	spin_unlock_irqrestore(&domain->lock, flags);
1367 }
1368 
1369 static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1370 				    unsigned long pfn, unsigned int pages,
1371 				    int ih)
1372 {
1373 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1374 	unsigned long bitmask = aligned_pages - 1;
1375 	unsigned int mask = ilog2(aligned_pages);
1376 	u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1377 
1378 	/*
1379 	 * PSI masks the low order bits of the base address. If the
1380 	 * address isn't aligned to the mask, then compute a mask value
1381 	 * needed to ensure the target range is flushed.
1382 	 */
1383 	if (unlikely(bitmask & pfn)) {
1384 		unsigned long end_pfn = pfn + pages - 1, shared_bits;
1385 
1386 		/*
1387 		 * Since end_pfn <= pfn + bitmask, the only way bits
1388 		 * higher than bitmask can differ in pfn and end_pfn is
1389 		 * by carrying. This means after masking out bitmask,
1390 		 * high bits starting with the first set bit in
1391 		 * shared_bits are all equal in both pfn and end_pfn.
1392 		 */
1393 		shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1394 		mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1395 	}
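	/*
	 * Worked example (illustrative): pfn == 3 and pages == 2 give
	 * aligned_pages == 2, bitmask == 1 and an unaligned base, with
	 * end_pfn == 4. shared_bits == ~(3 ^ 4) & ~1 has its lowest set
	 * bit at position 3, so mask becomes 3 and the PSI covers pfns
	 * 0-7, which includes the target range 3-4.
	 */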
1396 
1397 	/*
1398 	 * Fallback to domain selective flush if no PSI support or
1399 	 * the size is too big.
1400 	 */
1401 	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1402 		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1403 					 DMA_TLB_DSI_FLUSH);
1404 	else
1405 		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1406 					 DMA_TLB_PSI_FLUSH);
1407 }
1408 
1409 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1410 				  struct dmar_domain *domain,
1411 				  unsigned long pfn, unsigned int pages,
1412 				  int ih, int map)
1413 {
1414 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1415 	unsigned int mask = ilog2(aligned_pages);
1416 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1417 	u16 did = domain_id_iommu(domain, iommu);
1418 
1419 	if (WARN_ON(!pages))
1420 		return;
1421 
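	/* IH (invalidation hint) is carried in bit 6 of the flush address. */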
1422 	if (ih)
1423 		ih = 1 << 6;
1424 
1425 	if (domain->use_first_level)
1426 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1427 	else
1428 		__iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1429 
1430 	/*
1431 	 * In caching mode, changes of pages from non-present to present require
1432 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1433 	 */
1434 	if (!cap_caching_mode(iommu->cap) || !map)
1435 		iommu_flush_dev_iotlb(domain, addr, mask);
1436 }
1437 
1438 /* Notification for newly created mappings */
1439 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1440 				 unsigned long pfn, unsigned int pages)
1441 {
1442 	/*
1443 	 * It's a non-present to present mapping. Only flush in caching mode
1444 	 * and for second-level translation.
1445 	 */
1446 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1447 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1448 	else
1449 		iommu_flush_write_buffer(iommu);
1450 }
1451 
1452 /*
1453  * Flush the relevant caches in nested translation if the domain
1454  * also serves as a parent
1455  */
1456 static void parent_domain_flush(struct dmar_domain *domain,
1457 				unsigned long pfn,
1458 				unsigned long pages, int ih)
1459 {
1460 	struct dmar_domain *s1_domain;
1461 
1462 	spin_lock(&domain->s1_lock);
1463 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1464 		struct device_domain_info *device_info;
1465 		struct iommu_domain_info *info;
1466 		unsigned long flags;
1467 		unsigned long i;
1468 
1469 		xa_for_each(&s1_domain->iommu_array, i, info)
1470 			__iommu_flush_iotlb_psi(info->iommu, info->did,
1471 						pfn, pages, ih);
1472 
1473 		if (!s1_domain->has_iotlb_device)
1474 			continue;
1475 
1476 		spin_lock_irqsave(&s1_domain->lock, flags);
1477 		list_for_each_entry(device_info, &s1_domain->devices, link)
1478 			/*
1479 			 * Address translation cache in device side caches the
1480 			 * result of nested translation. There is no easy way
1481 			 * to identify the exact set of nested translations
1482 			 * affected by a change in S2. So just flush the entire
1483 			 * device cache.
1484 			 */
1485 			__iommu_flush_dev_iotlb(device_info, 0,
1486 						MAX_AGAW_PFN_WIDTH);
1487 		spin_unlock_irqrestore(&s1_domain->lock, flags);
1488 	}
1489 	spin_unlock(&domain->s1_lock);
1490 }
1491 
1492 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1493 {
1494 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1495 	struct iommu_domain_info *info;
1496 	unsigned long idx;
1497 
1498 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1499 		struct intel_iommu *iommu = info->iommu;
1500 		u16 did = domain_id_iommu(dmar_domain, iommu);
1501 
1502 		if (dmar_domain->use_first_level)
1503 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1504 		else
1505 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1506 						 DMA_TLB_DSI_FLUSH);
1507 
1508 		if (!cap_caching_mode(iommu->cap))
1509 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1510 	}
1511 
1512 	if (dmar_domain->nested_parent)
1513 		parent_domain_flush(dmar_domain, 0, -1, 0);
1514 }
1515 
1516 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1517 {
1518 	u32 pmen;
1519 	unsigned long flags;
1520 
1521 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1522 		return;
1523 
1524 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1525 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1526 	pmen &= ~DMA_PMEN_EPM;
1527 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1528 
1529 	/* wait for the protected region status bit to clear */
1530 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1531 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1532 
1533 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1534 }
1535 
1536 static void iommu_enable_translation(struct intel_iommu *iommu)
1537 {
1538 	u32 sts;
1539 	unsigned long flags;
1540 
1541 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542 	iommu->gcmd |= DMA_GCMD_TE;
1543 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1544 
1545 	/* Make sure hardware completes it */
1546 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1547 		      readl, (sts & DMA_GSTS_TES), sts);
1548 
1549 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1550 }
1551 
1552 static void iommu_disable_translation(struct intel_iommu *iommu)
1553 {
1554 	u32 sts;
1555 	unsigned long flag;
1556 
1557 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1558 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1559 		return;
1560 
1561 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1562 	iommu->gcmd &= ~DMA_GCMD_TE;
1563 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1564 
1565 	/* Make sure hardware completes it */
1566 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1567 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1568 
1569 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1570 }
1571 
1572 static int iommu_init_domains(struct intel_iommu *iommu)
1573 {
1574 	u32 ndomains;
1575 
1576 	ndomains = cap_ndoms(iommu->cap);
1577 	pr_debug("%s: Number of Domains supported <%d>\n",
1578 		 iommu->name, ndomains);
1579 
1580 	spin_lock_init(&iommu->lock);
1581 
1582 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1583 	if (!iommu->domain_ids)
1584 		return -ENOMEM;
1585 
1586 	/*
1587 	 * If Caching mode is set, then invalid translations are tagged
1588 	 * with domain-id 0, hence we need to pre-allocate it. We also
1589 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1590 	 * make sure it is not used for a real domain.
1591 	 */
1592 	set_bit(0, iommu->domain_ids);
1593 
1594 	/*
1595 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1596 	 * entry for first-level or pass-through translation modes should
1597 	 * be programmed with a domain id different from those used for
1598 	 * second-level or nested translation. We reserve a domain id for
1599 	 * this purpose.
1600 	 */
1601 	if (sm_supported(iommu))
1602 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1603 
1604 	return 0;
1605 }
1606 
1607 static void disable_dmar_iommu(struct intel_iommu *iommu)
1608 {
1609 	if (!iommu->domain_ids)
1610 		return;
1611 
1612 	/*
1613 	 * All iommu domains must have been detached from the devices,
1614 	 * hence there should be no domain IDs in use.
1615 	 */
1616 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1617 		    > NUM_RESERVED_DID))
1618 		return;
1619 
1620 	if (iommu->gcmd & DMA_GCMD_TE)
1621 		iommu_disable_translation(iommu);
1622 }
1623 
1624 static void free_dmar_iommu(struct intel_iommu *iommu)
1625 {
1626 	if (iommu->domain_ids) {
1627 		bitmap_free(iommu->domain_ids);
1628 		iommu->domain_ids = NULL;
1629 	}
1630 
1631 	if (iommu->copied_tables) {
1632 		bitmap_free(iommu->copied_tables);
1633 		iommu->copied_tables = NULL;
1634 	}
1635 
1636 	/* free context mapping */
1637 	free_context_table(iommu);
1638 
1639 #ifdef CONFIG_INTEL_IOMMU_SVM
1640 	if (pasid_supported(iommu)) {
1641 		if (ecap_prs(iommu->ecap))
1642 			intel_svm_finish_prq(iommu);
1643 	}
1644 #endif
1645 }
1646 
1647 /*
1648  * Check and return whether first level is used by default for
1649  * DMA translation.
1650  */
1651 static bool first_level_by_default(unsigned int type)
1652 {
1653 	/* Only SL is available in legacy mode */
1654 	if (!scalable_mode_support())
1655 		return false;
1656 
1657 	/* Only one level (either FL or SL) is available, just use it */
1658 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1659 		return intel_cap_flts_sanity();
1660 
1661 	/* Both levels are available, decide it based on domain type */
1662 	return type != IOMMU_DOMAIN_UNMANAGED;
1663 }
1664 
1665 static struct dmar_domain *alloc_domain(unsigned int type)
1666 {
1667 	struct dmar_domain *domain;
1668 
1669 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1670 	if (!domain)
1671 		return NULL;
1672 
1673 	domain->nid = NUMA_NO_NODE;
1674 	if (first_level_by_default(type))
1675 		domain->use_first_level = true;
1676 	domain->has_iotlb_device = false;
1677 	INIT_LIST_HEAD(&domain->devices);
1678 	INIT_LIST_HEAD(&domain->dev_pasids);
1679 	spin_lock_init(&domain->lock);
1680 	xa_init(&domain->iommu_array);
1681 
1682 	return domain;
1683 }
1684 
1685 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1686 {
1687 	struct iommu_domain_info *info, *curr;
1688 	unsigned long ndomains;
1689 	int num, ret = -ENOSPC;
1690 
1691 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1692 	if (!info)
1693 		return -ENOMEM;
1694 
1695 	spin_lock(&iommu->lock);
1696 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1697 	if (curr) {
1698 		curr->refcnt++;
1699 		spin_unlock(&iommu->lock);
1700 		kfree(info);
1701 		return 0;
1702 	}
1703 
1704 	ndomains = cap_ndoms(iommu->cap);
1705 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1706 	if (num >= ndomains) {
1707 		pr_err("%s: No free domain ids\n", iommu->name);
1708 		goto err_unlock;
1709 	}
1710 
1711 	set_bit(num, iommu->domain_ids);
1712 	info->refcnt	= 1;
1713 	info->did	= num;
1714 	info->iommu	= iommu;
1715 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1716 			  NULL, info, GFP_ATOMIC);
1717 	if (curr) {
1718 		ret = xa_err(curr) ? : -EBUSY;
1719 		goto err_clear;
1720 	}
1721 	domain_update_iommu_cap(domain);
1722 
1723 	spin_unlock(&iommu->lock);
1724 	return 0;
1725 
1726 err_clear:
1727 	clear_bit(info->did, iommu->domain_ids);
1728 err_unlock:
1729 	spin_unlock(&iommu->lock);
1730 	kfree(info);
1731 	return ret;
1732 }
1733 
1734 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1735 {
1736 	struct iommu_domain_info *info;
1737 
1738 	spin_lock(&iommu->lock);
1739 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1740 	if (--info->refcnt == 0) {
1741 		clear_bit(info->did, iommu->domain_ids);
1742 		xa_erase(&domain->iommu_array, iommu->seq_id);
1743 		domain->nid = NUMA_NO_NODE;
1744 		domain_update_iommu_cap(domain);
1745 		kfree(info);
1746 	}
1747 	spin_unlock(&iommu->lock);
1748 }
1749 
1750 static int guestwidth_to_adjustwidth(int gaw)
1751 {
1752 	int agaw;
1753 	int r = (gaw - 12) % 9;
1754 
1755 	if (r == 0)
1756 		agaw = gaw;
1757 	else
1758 		agaw = gaw + 9 - r;
1759 	if (agaw > 64)
1760 		agaw = 64;
1761 	return agaw;
1762 }
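/*
 * Illustrative examples: a guest width of 48 is already a multiple of 9
 * above the 12-bit page offset and is returned unchanged, while a guest
 * width of 52 is rounded up to the next page-table-level boundary, 57.
 */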
1763 
1764 static void domain_exit(struct dmar_domain *domain)
1765 {
1766 	if (domain->pgd) {
1767 		LIST_HEAD(freelist);
1768 
1769 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1770 		put_pages_list(&freelist);
1771 	}
1772 
1773 	if (WARN_ON(!list_empty(&domain->devices)))
1774 		return;
1775 
1776 	kfree(domain);
1777 }
1778 
1779 /*
1780  * Get the PASID directory size for scalable mode context entry.
1781  * Value of X in the PDTS field of a scalable mode context entry
1782  * indicates PASID directory with 2^(X + 7) entries.
1783  */
1784 static unsigned long context_get_sm_pds(struct pasid_table *table)
1785 {
1786 	unsigned long pds, max_pde;
1787 
1788 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1789 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1790 	if (pds < 7)
1791 		return 0;
1792 
1793 	return pds - 7;
1794 }
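/*
 * Worked example (assuming PASID_PDE_SHIFT is 6): with a 20-bit max_pasid
 * the directory needs 2^14 entries, find_first_bit() returns 14, and the
 * function yields pds == 7, i.e. 2^(7 + 7) = 2^14 directory entries.
 */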
1795 
1796 static int domain_context_mapping_one(struct dmar_domain *domain,
1797 				      struct intel_iommu *iommu,
1798 				      struct pasid_table *table,
1799 				      u8 bus, u8 devfn)
1800 {
1801 	struct device_domain_info *info =
1802 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1803 	u16 did = domain_id_iommu(domain, iommu);
1804 	int translation = CONTEXT_TT_MULTI_LEVEL;
1805 	struct context_entry *context;
1806 	int ret;
1807 
1808 	if (hw_pass_through && domain_type_is_si(domain))
1809 		translation = CONTEXT_TT_PASS_THROUGH;
1810 
1811 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1812 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1813 
1814 	spin_lock(&iommu->lock);
1815 	ret = -ENOMEM;
1816 	context = iommu_context_addr(iommu, bus, devfn, 1);
1817 	if (!context)
1818 		goto out_unlock;
1819 
1820 	ret = 0;
1821 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1822 		goto out_unlock;
1823 
1824 	/*
1825 	 * For kdump cases, old valid entries may be cached due to the
1826 	 * in-flight DMA and copied pgtable, but there is no unmapping
1827 	 * behaviour for them, thus we need an explicit cache flush for
1828 	 * the newly-mapped device. For kdump, at this point, the device
1829 	 * is supposed to finish reset at its driver probe stage, so no
1830 	 * in-flight DMA will exist, and we don't need to worry about it
1831 	 * hereafter.
1832 	 */
1833 	if (context_copied(iommu, bus, devfn)) {
1834 		u16 did_old = context_domain_id(context);
1835 
1836 		if (did_old < cap_ndoms(iommu->cap)) {
1837 			iommu->flush.flush_context(iommu, did_old,
1838 						   (((u16)bus) << 8) | devfn,
1839 						   DMA_CCMD_MASK_NOBIT,
1840 						   DMA_CCMD_DEVICE_INVL);
1841 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1842 						 DMA_TLB_DSI_FLUSH);
1843 		}
1844 
1845 		clear_context_copied(iommu, bus, devfn);
1846 	}
1847 
1848 	context_clear_entry(context);
1849 
1850 	if (sm_supported(iommu)) {
1851 		unsigned long pds;
1852 
1853 		/* Setup the PASID DIR pointer: */
1854 		pds = context_get_sm_pds(table);
1855 		context->lo = (u64)virt_to_phys(table->table) |
1856 				context_pdts(pds);
1857 
1858 		/* Setup the RID_PASID field: */
1859 		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1860 
1861 		/*
1862 		 * Setup the Device-TLB enable bit and Page request
1863 		 * Enable bit:
1864 		 */
1865 		if (info && info->ats_supported)
1866 			context_set_sm_dte(context);
1867 		if (info && info->pri_supported)
1868 			context_set_sm_pre(context);
1869 		if (info && info->pasid_supported)
1870 			context_set_pasid(context);
1871 	} else {
1872 		struct dma_pte *pgd = domain->pgd;
1873 		int agaw;
1874 
1875 		context_set_domain_id(context, did);
1876 
1877 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1878 			/*
1879 			 * Skip top levels of page tables for an IOMMU whose
1880 			 * agaw is less than the domain's. Unnecessary for PT mode.
1881 			 */
1882 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1883 				ret = -ENOMEM;
1884 				pgd = phys_to_virt(dma_pte_addr(pgd));
1885 				if (!dma_pte_present(pgd))
1886 					goto out_unlock;
1887 			}
1888 
1889 			if (info && info->ats_supported)
1890 				translation = CONTEXT_TT_DEV_IOTLB;
1891 			else
1892 				translation = CONTEXT_TT_MULTI_LEVEL;
1893 
1894 			context_set_address_root(context, virt_to_phys(pgd));
1895 			context_set_address_width(context, agaw);
1896 		} else {
1897 			/*
1898 			 * In pass-through mode, AW must be programmed to
1899 			 * indicate the largest AGAW value supported by
1900 			 * hardware, and ASR is ignored by hardware.
1901 			 */
1902 			context_set_address_width(context, iommu->msagaw);
1903 		}
1904 
1905 		context_set_translation_type(context, translation);
1906 	}
1907 
1908 	context_set_fault_enable(context);
1909 	context_set_present(context);
1910 	if (!ecap_coherent(iommu->ecap))
1911 		clflush_cache_range(context, sizeof(*context));
1912 
1913 	/*
1914 	 * It's a non-present to present mapping. If hardware doesn't cache
1915 	 * non-present entries we only need to flush the write-buffer. If it
1916 	 * _does_ cache non-present entries, then it does so in the special
1917 	 * domain #0, which we have to flush:
1918 	 */
1919 	if (cap_caching_mode(iommu->cap)) {
1920 		iommu->flush.flush_context(iommu, 0,
1921 					   (((u16)bus) << 8) | devfn,
1922 					   DMA_CCMD_MASK_NOBIT,
1923 					   DMA_CCMD_DEVICE_INVL);
1924 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1925 	} else {
1926 		iommu_flush_write_buffer(iommu);
1927 	}
1928 
1929 	ret = 0;
1930 
1931 out_unlock:
1932 	spin_unlock(&iommu->lock);
1933 
1934 	return ret;
1935 }
1936 
1937 struct domain_context_mapping_data {
1938 	struct dmar_domain *domain;
1939 	struct intel_iommu *iommu;
1940 	struct pasid_table *table;
1941 };
1942 
1943 static int domain_context_mapping_cb(struct pci_dev *pdev,
1944 				     u16 alias, void *opaque)
1945 {
1946 	struct domain_context_mapping_data *data = opaque;
1947 
1948 	return domain_context_mapping_one(data->domain, data->iommu,
1949 					  data->table, PCI_BUS_NUM(alias),
1950 					  alias & 0xff);
1951 }
1952 
1953 static int
1954 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1955 {
1956 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1957 	struct domain_context_mapping_data data;
1958 	struct intel_iommu *iommu = info->iommu;
1959 	u8 bus = info->bus, devfn = info->devfn;
1960 	struct pasid_table *table;
1961 
1962 	table = intel_pasid_get_table(dev);
1963 
1964 	if (!dev_is_pci(dev))
1965 		return domain_context_mapping_one(domain, iommu, table,
1966 						  bus, devfn);
1967 
1968 	data.domain = domain;
1969 	data.iommu = iommu;
1970 	data.table = table;
1971 
1972 	return pci_for_each_dma_alias(to_pci_dev(dev),
1973 				      &domain_context_mapping_cb, &data);
1974 }
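
/*
 * Note: for PCI devices the context entry is programmed for every DMA
 * alias of the device, not just its own bus/devfn.  A conventional PCI
 * device behind a PCIe-to-PCI bridge, for instance, may issue requests
 * tagged with the bridge's RID, so pci_for_each_dma_alias() gives that
 * RID the same context mapping as the device itself.
 */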
1975 
1976 /* Returns a number of VTD pages, but aligned to MM page size */
1977 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1978 {
1979 	host_addr &= ~PAGE_MASK;
1980 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1981 }
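
/*
 * Worked example for aligned_nrpages() above (illustrative; assumes
 * 4KiB MM pages and 4KiB VT-d pages): host_addr = 0x1234 and size =
 * 0x2000 give a page offset of 0x234, PAGE_ALIGN(0x2234) = 0x3000,
 * so three VT-d pages are needed to cover the buffer.
 */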
1982 
1983 /* Return largest possible superpage level for a given mapping */
1984 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1985 				   unsigned long phy_pfn, unsigned long pages)
1986 {
1987 	int support, level = 1;
1988 	unsigned long pfnmerge;
1989 
1990 	support = domain->iommu_superpage;
1991 
1992 	/* To use a large page, the virtual *and* physical addresses
1993 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1994 	   of them will mean we have to use smaller pages. So just
1995 	   merge them and check both at once. */
1996 	pfnmerge = iov_pfn | phy_pfn;
1997 
1998 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1999 		pages >>= VTD_STRIDE_SHIFT;
2000 		if (!pages)
2001 			break;
2002 		pfnmerge >>= VTD_STRIDE_SHIFT;
2003 		level++;
2004 		support--;
2005 	}
2006 	return level;
2007 }
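
/*
 * Worked example for hardware_largepage_caps() above (illustrative;
 * assumes the 9-bit VT-d stride, so level 1 = 4KiB, level 2 = 2MiB,
 * level 3 = 1GiB): if iov_pfn and phy_pfn both have their low nine
 * bits clear (2MiB alignment), pages >= 512 and the domain reports one
 * level of superpage support, the loop runs once and returns level 2,
 * so a 2MiB superpage can be used for the mapping.
 */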
2008 
2009 /*
2010  * Ensure that old small page tables are removed to make room for superpage(s).
2011  * We're going to add new large pages, so make sure we don't remove their parent
2012  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2013  */
2014 static void switch_to_super_page(struct dmar_domain *domain,
2015 				 unsigned long start_pfn,
2016 				 unsigned long end_pfn, int level)
2017 {
2018 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2019 	struct iommu_domain_info *info;
2020 	struct dma_pte *pte = NULL;
2021 	unsigned long i;
2022 
2023 	while (start_pfn <= end_pfn) {
2024 		if (!pte)
2025 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2026 					     GFP_ATOMIC);
2027 
2028 		if (dma_pte_present(pte)) {
2029 			dma_pte_free_pagetable(domain, start_pfn,
2030 					       start_pfn + lvl_pages - 1,
2031 					       level + 1);
2032 
2033 			xa_for_each(&domain->iommu_array, i, info)
2034 				iommu_flush_iotlb_psi(info->iommu, domain,
2035 						      start_pfn, lvl_pages,
2036 						      0, 0);
2037 			if (domain->nested_parent)
2038 				parent_domain_flush(domain, start_pfn,
2039 						    lvl_pages, 0);
2040 		}
2041 
2042 		pte++;
2043 		start_pfn += lvl_pages;
2044 		if (first_pte_in_page(pte))
2045 			pte = NULL;
2046 	}
2047 }
2048 
2049 static int
2050 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2052 		 gfp_t gfp)
2053 {
2054 	struct dma_pte *first_pte = NULL, *pte = NULL;
2055 	unsigned int largepage_lvl = 0;
2056 	unsigned long lvl_pages = 0;
2057 	phys_addr_t pteval;
2058 	u64 attr;
2059 
2060 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2061 		return -EINVAL;
2062 
2063 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2064 		return -EINVAL;
2065 
2066 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2067 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2068 		return -EINVAL;
2069 	}
2070 
2071 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2072 	attr |= DMA_FL_PTE_PRESENT;
2073 	if (domain->use_first_level) {
2074 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2075 		if (prot & DMA_PTE_WRITE)
2076 			attr |= DMA_FL_PTE_DIRTY;
2077 	}
2078 
2079 	domain->has_mappings = true;
2080 
2081 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2082 
2083 	while (nr_pages > 0) {
2084 		uint64_t tmp;
2085 
2086 		if (!pte) {
2087 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2088 					phys_pfn, nr_pages);
2089 
2090 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2091 					     gfp);
2092 			if (!pte)
2093 				return -ENOMEM;
2094 			first_pte = pte;
2095 
2096 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2097 
2098 			/* It is a large page */
2099 			if (largepage_lvl > 1) {
2100 				unsigned long end_pfn;
2101 				unsigned long pages_to_remove;
2102 
2103 				pteval |= DMA_PTE_LARGE_PAGE;
2104 				pages_to_remove = min_t(unsigned long, nr_pages,
2105 							nr_pte_to_next_page(pte) * lvl_pages);
2106 				end_pfn = iov_pfn + pages_to_remove - 1;
2107 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2108 			} else {
2109 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2110 			}
2111 
2112 		}
2113 		/* We don't need a lock here; nobody else
2114 		 * touches this IOVA range.
2115 		 */
2116 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2117 		if (tmp) {
2118 			static int dumps = 5;
2119 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2120 				iov_pfn, tmp, (unsigned long long)pteval);
2121 			if (dumps) {
2122 				dumps--;
2123 				debug_dma_dump_mappings(NULL);
2124 			}
2125 			WARN_ON(1);
2126 		}
2127 
2128 		nr_pages -= lvl_pages;
2129 		iov_pfn += lvl_pages;
2130 		phys_pfn += lvl_pages;
2131 		pteval += lvl_pages * VTD_PAGE_SIZE;
2132 
2133 		/* If the next PTE would be the first in a new page, then we
2134 		 * need to flush the cache on the entries we've just written.
2135 		 * And then we'll need to recalculate 'pte', so clear it and
2136 		 * let it get set again in the if (!pte) block above.
2137 		 *
2138 		 * If we're done (!nr_pages) we need to flush the cache too.
2139 		 *
2140 		 * Also if we've been setting superpages, we may need to
2141 		 * recalculate 'pte' and switch back to smaller pages for the
2142 		 * end of the mapping, if the trailing size is not enough to
2143 		 * use another superpage (i.e. nr_pages < lvl_pages).
2144 		 */
2145 		pte++;
2146 		if (!nr_pages || first_pte_in_page(pte) ||
2147 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2148 			domain_flush_cache(domain, first_pte,
2149 					   (void *)pte - (void *)first_pte);
2150 			pte = NULL;
2151 		}
2152 	}
2153 
2154 	return 0;
2155 }
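
/*
 * Mapping example for __domain_mapping() above (illustrative; assumes
 * the domain reports at least one level of superpage support and the
 * 9-bit stride, so a level-2 superpage covers 512 4KiB pages): a
 * request for 513 pages starting at a 2MiB-aligned iov_pfn/phys_pfn is
 * written as one 2MiB PTE (with DMA_PTE_LARGE_PAGE set) followed by a
 * single 4KiB PTE for the trailing page, because the remaining count
 * (1) is smaller than lvl_pages (512).
 */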
2156 
2157 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2158 {
2159 	struct intel_iommu *iommu = info->iommu;
2160 	struct context_entry *context;
2161 	u16 did_old;
2162 
2163 	if (!iommu)
2164 		return;
2165 
2166 	spin_lock(&iommu->lock);
2167 	context = iommu_context_addr(iommu, bus, devfn, 0);
2168 	if (!context) {
2169 		spin_unlock(&iommu->lock);
2170 		return;
2171 	}
2172 
2173 	if (sm_supported(iommu)) {
2174 		if (hw_pass_through && domain_type_is_si(info->domain))
2175 			did_old = FLPT_DEFAULT_DID;
2176 		else
2177 			did_old = domain_id_iommu(info->domain, iommu);
2178 	} else {
2179 		did_old = context_domain_id(context);
2180 	}
2181 
2182 	context_clear_entry(context);
2183 	__iommu_flush_cache(iommu, context, sizeof(*context));
2184 	spin_unlock(&iommu->lock);
2185 	iommu->flush.flush_context(iommu,
2186 				   did_old,
2187 				   (((u16)bus) << 8) | devfn,
2188 				   DMA_CCMD_MASK_NOBIT,
2189 				   DMA_CCMD_DEVICE_INVL);
2190 
2191 	if (sm_supported(iommu))
2192 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2193 
2194 	iommu->flush.flush_iotlb(iommu,
2195 				 did_old,
2196 				 0,
2197 				 0,
2198 				 DMA_TLB_DSI_FLUSH);
2199 
2200 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2201 }
2202 
2203 static int domain_setup_first_level(struct intel_iommu *iommu,
2204 				    struct dmar_domain *domain,
2205 				    struct device *dev,
2206 				    u32 pasid)
2207 {
2208 	struct dma_pte *pgd = domain->pgd;
2209 	int agaw, level;
2210 	int flags = 0;
2211 
2212 	/*
2213 	 * Skip top levels of page tables for an IOMMU whose agaw is
2214 	 * less than the domain's. Unnecessary for PT mode.
2215 	 */
2216 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2217 		pgd = phys_to_virt(dma_pte_addr(pgd));
2218 		if (!dma_pte_present(pgd))
2219 			return -ENOMEM;
2220 	}
2221 
2222 	level = agaw_to_level(agaw);
2223 	if (level != 4 && level != 5)
2224 		return -EINVAL;
2225 
2226 	if (level == 5)
2227 		flags |= PASID_FLAG_FL5LP;
2228 
2229 	if (domain->force_snooping)
2230 		flags |= PASID_FLAG_PAGE_SNOOP;
2231 
2232 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2233 					     domain_id_iommu(domain, iommu),
2234 					     flags);
2235 }
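
/*
 * Example for domain_setup_first_level() above (illustrative; assumes
 * the usual width-to-level mapping where a 48-bit address width uses
 * 4-level paging and a 57-bit width uses 5-level paging): a 48-bit
 * domain programs a 4-level first-level table with no extra flags,
 * while a 57-bit domain additionally passes PASID_FLAG_FL5LP so the
 * PASID entry enables 5-level paging.
 */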
2236 
2237 static bool dev_is_real_dma_subdevice(struct device *dev)
2238 {
2239 	return dev && dev_is_pci(dev) &&
2240 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2241 }
2242 
2243 static int iommu_domain_identity_map(struct dmar_domain *domain,
2244 				     unsigned long first_vpfn,
2245 				     unsigned long last_vpfn)
2246 {
2247 	/*
2248 	 * The RMRR range might overlap with a physical memory range,
2249 	 * so clear it first.
2250 	 */
2251 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2252 
2253 	return __domain_mapping(domain, first_vpfn,
2254 				first_vpfn, last_vpfn - first_vpfn + 1,
2255 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2256 }
2257 
2258 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2259 
2260 static int __init si_domain_init(int hw)
2261 {
2262 	struct dmar_rmrr_unit *rmrr;
2263 	struct device *dev;
2264 	int i, nid, ret;
2265 
2266 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2267 	if (!si_domain)
2268 		return -EFAULT;
2269 
2270 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2271 		domain_exit(si_domain);
2272 		si_domain = NULL;
2273 		return -EFAULT;
2274 	}
2275 
2276 	if (hw)
2277 		return 0;
2278 
2279 	for_each_online_node(nid) {
2280 		unsigned long start_pfn, end_pfn;
2281 		int i;
2282 
2283 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2284 			ret = iommu_domain_identity_map(si_domain,
2285 					mm_to_dma_pfn_start(start_pfn),
2286 					mm_to_dma_pfn_end(end_pfn));
2287 			if (ret)
2288 				return ret;
2289 		}
2290 	}
2291 
2292 	/*
2293 	 * Identity map the RMRRs so that devices with RMRRs can also use
2294 	 * the si_domain.
2295 	 */
2296 	for_each_rmrr_units(rmrr) {
2297 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2298 					  i, dev) {
2299 			unsigned long long start = rmrr->base_address;
2300 			unsigned long long end = rmrr->end_address;
2301 
2302 			if (WARN_ON(end < start ||
2303 				    end >> agaw_to_width(si_domain->agaw)))
2304 				continue;
2305 
2306 			ret = iommu_domain_identity_map(si_domain,
2307 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2308 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2309 			if (ret)
2310 				return ret;
2311 		}
2312 	}
2313 
2314 	return 0;
2315 }
2316 
2317 static int dmar_domain_attach_device(struct dmar_domain *domain,
2318 				     struct device *dev)
2319 {
2320 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2321 	struct intel_iommu *iommu = info->iommu;
2322 	unsigned long flags;
2323 	int ret;
2324 
2325 	ret = domain_attach_iommu(domain, iommu);
2326 	if (ret)
2327 		return ret;
2328 	info->domain = domain;
2329 	spin_lock_irqsave(&domain->lock, flags);
2330 	list_add(&info->link, &domain->devices);
2331 	spin_unlock_irqrestore(&domain->lock, flags);
2332 
2333 	/* PASID table is mandatory for a PCI device in scalable mode. */
2334 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2335 		/* Setup the PASID entry for requests without PASID: */
2336 		if (hw_pass_through && domain_type_is_si(domain))
2337 			ret = intel_pasid_setup_pass_through(iommu,
2338 					dev, IOMMU_NO_PASID);
2339 		else if (domain->use_first_level)
2340 			ret = domain_setup_first_level(iommu, domain, dev,
2341 					IOMMU_NO_PASID);
2342 		else
2343 			ret = intel_pasid_setup_second_level(iommu, domain,
2344 					dev, IOMMU_NO_PASID);
2345 		if (ret) {
2346 			dev_err(dev, "Setup RID2PASID failed\n");
2347 			device_block_translation(dev);
2348 			return ret;
2349 		}
2350 	}
2351 
2352 	ret = domain_context_mapping(domain, dev);
2353 	if (ret) {
2354 		dev_err(dev, "Domain context map failed\n");
2355 		device_block_translation(dev);
2356 		return ret;
2357 	}
2358 
2359 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2360 		iommu_enable_pci_caps(info);
2361 
2362 	return 0;
2363 }
2364 
2365 /**
2366  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2367  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2368  * @dev: device handle
2369  *
2370  * We assume that PCI USB devices with RMRRs have them largely
2371  * for historical reasons and that the RMRR space is not actively used post
2372  * boot.  This exclusion may change if vendors begin to abuse it.
2373  *
2374  * The same exception is made for graphics devices, with the requirement that
2375  * any use of the RMRR regions will be torn down before assigning the device
2376  * to a guest.
2377  *
2378  * Return: true if the RMRR is relaxable, false otherwise
2379  */
2380 static bool device_rmrr_is_relaxable(struct device *dev)
2381 {
2382 	struct pci_dev *pdev;
2383 
2384 	if (!dev_is_pci(dev))
2385 		return false;
2386 
2387 	pdev = to_pci_dev(dev);
2388 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2389 	return IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev);
2393 
2394 /*
2395  * Return the required default domain type for a specific device.
2396  *
2397  * @dev: the device being queried
2399  *
2400  * Returns:
2401  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2402  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2403  *  - 0: both identity and dynamic domains work for this device
2404  */
2405 static int device_def_domain_type(struct device *dev)
2406 {
2407 	if (dev_is_pci(dev)) {
2408 		struct pci_dev *pdev = to_pci_dev(dev);
2409 
2410 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2411 			return IOMMU_DOMAIN_IDENTITY;
2412 
2413 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2414 			return IOMMU_DOMAIN_IDENTITY;
2415 	}
2416 
2417 	return 0;
2418 }
2419 
2420 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2421 {
2422 	/*
2423 	 * Start from a sane IOMMU hardware state.
2424 	 * If queued invalidation was already initialized by us
2425 	 * (for example, while enabling interrupt remapping), then
2426 	 * things are already rolling from a sane state.
2427 	 */
2428 	if (!iommu->qi) {
2429 		/*
2430 		 * Clear any previous faults.
2431 		 */
2432 		dmar_fault(-1, iommu);
2433 		/*
2434 		 * Disable queued invalidation if supported and already enabled
2435 		 * before OS handover.
2436 		 */
2437 		dmar_disable_qi(iommu);
2438 	}
2439 
2440 	if (dmar_enable_qi(iommu)) {
2441 		/*
2442 		 * Queued Invalidate not enabled, use Register Based Invalidate
2443 		 */
2444 		iommu->flush.flush_context = __iommu_flush_context;
2445 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2446 		pr_info("%s: Using Register based invalidation\n",
2447 			iommu->name);
2448 	} else {
2449 		iommu->flush.flush_context = qi_flush_context;
2450 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2451 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2452 	}
2453 }
2454 
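/*
 * Layout note for the copy helpers below (per the VT-d scalable-mode
 * table layout): in scalable mode each bus has two context tables, the
 * lower one (devfn 0x00-0x7f) referenced by the root entry's lower
 * context-table pointer and the upper one (devfn 0x80-0xff) by the
 * upper pointer, and each scalable-mode context entry is twice the
 * size of a legacy struct context_entry.  That is why 'ext' doubles
 * both the table index (bus * 2) and the entry index (devfn * 2,
 * modulo 256) in copy_context_table().
 */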
2455 static int copy_context_table(struct intel_iommu *iommu,
2456 			      struct root_entry *old_re,
2457 			      struct context_entry **tbl,
2458 			      int bus, bool ext)
2459 {
2460 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2461 	struct context_entry *new_ce = NULL, ce;
2462 	struct context_entry *old_ce = NULL;
2463 	struct root_entry re;
2464 	phys_addr_t old_ce_phys;
2465 
2466 	tbl_idx = ext ? bus * 2 : bus;
2467 	memcpy(&re, old_re, sizeof(re));
2468 
2469 	for (devfn = 0; devfn < 256; devfn++) {
2470 		/* First calculate the correct index */
2471 		idx = (ext ? devfn * 2 : devfn) % 256;
2472 
2473 		if (idx == 0) {
2474 			/* First save what we may have and clean up */
2475 			if (new_ce) {
2476 				tbl[tbl_idx] = new_ce;
2477 				__iommu_flush_cache(iommu, new_ce,
2478 						    VTD_PAGE_SIZE);
2479 				pos = 1;
2480 			}
2481 
2482 			if (old_ce)
2483 				memunmap(old_ce);
2484 
2485 			ret = 0;
2486 			if (devfn < 0x80)
2487 				old_ce_phys = root_entry_lctp(&re);
2488 			else
2489 				old_ce_phys = root_entry_uctp(&re);
2490 
2491 			if (!old_ce_phys) {
2492 				if (ext && devfn == 0) {
2493 					/* No LCTP, try UCTP */
2494 					devfn = 0x7f;
2495 					continue;
2496 				} else {
2497 					goto out;
2498 				}
2499 			}
2500 
2501 			ret = -ENOMEM;
2502 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2503 					MEMREMAP_WB);
2504 			if (!old_ce)
2505 				goto out;
2506 
2507 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2508 			if (!new_ce)
2509 				goto out_unmap;
2510 
2511 			ret = 0;
2512 		}
2513 
2514 		/* Now copy the context entry */
2515 		memcpy(&ce, old_ce + idx, sizeof(ce));
2516 
2517 		if (!context_present(&ce))
2518 			continue;
2519 
2520 		did = context_domain_id(&ce);
2521 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2522 			set_bit(did, iommu->domain_ids);
2523 
2524 		set_context_copied(iommu, bus, devfn);
2525 		new_ce[idx] = ce;
2526 	}
2527 
2528 	tbl[tbl_idx + pos] = new_ce;
2529 
2530 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2531 
2532 out_unmap:
2533 	memunmap(old_ce);
2534 
2535 out:
2536 	return ret;
2537 }
2538 
2539 static int copy_translation_tables(struct intel_iommu *iommu)
2540 {
2541 	struct context_entry **ctxt_tbls;
2542 	struct root_entry *old_rt;
2543 	phys_addr_t old_rt_phys;
2544 	int ctxt_table_entries;
2545 	u64 rtaddr_reg;
2546 	int bus, ret;
2547 	bool new_ext, ext;
2548 
2549 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2550 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2551 	new_ext    = !!sm_supported(iommu);
2552 
2553 	/*
2554 	 * The RTT bit can only be changed when translation is disabled,
2555 	 * but disabling translation would open a window for data
2556 	 * corruption. So bail out and don't copy anything if we would
2557 	 * have to change the bit.
2558 	 */
2559 	if (new_ext != ext)
2560 		return -EINVAL;
2561 
2562 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2563 	if (!iommu->copied_tables)
2564 		return -ENOMEM;
2565 
2566 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2567 	if (!old_rt_phys)
2568 		return -EINVAL;
2569 
2570 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2571 	if (!old_rt)
2572 		return -ENOMEM;
2573 
2574 	/* This is too big for the stack - allocate it from slab */
2575 	ctxt_table_entries = ext ? 512 : 256;
2576 	ret = -ENOMEM;
2577 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2578 	if (!ctxt_tbls)
2579 		goto out_unmap;
2580 
2581 	for (bus = 0; bus < 256; bus++) {
2582 		ret = copy_context_table(iommu, &old_rt[bus],
2583 					 ctxt_tbls, bus, ext);
2584 		if (ret) {
2585 			pr_err("%s: Failed to copy context table for bus %d\n",
2586 				iommu->name, bus);
2587 			continue;
2588 		}
2589 	}
2590 
2591 	spin_lock(&iommu->lock);
2592 
2593 	/* Context tables are copied, now write them to the root_entry table */
2594 	for (bus = 0; bus < 256; bus++) {
2595 		int idx = ext ? bus * 2 : bus;
2596 		u64 val;
2597 
2598 		if (ctxt_tbls[idx]) {
2599 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2600 			iommu->root_entry[bus].lo = val;
2601 		}
2602 
2603 		if (!ext || !ctxt_tbls[idx + 1])
2604 			continue;
2605 
2606 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2607 		iommu->root_entry[bus].hi = val;
2608 	}
2609 
2610 	spin_unlock(&iommu->lock);
2611 
2612 	kfree(ctxt_tbls);
2613 
2614 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2615 
2616 	ret = 0;
2617 
2618 out_unmap:
2619 	memunmap(old_rt);
2620 
2621 	return ret;
2622 }
2623 
2624 static int __init init_dmars(void)
2625 {
2626 	struct dmar_drhd_unit *drhd;
2627 	struct intel_iommu *iommu;
2628 	int ret;
2629 
2630 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2631 	if (ret)
2632 		goto free_iommu;
2633 
2634 	for_each_iommu(iommu, drhd) {
2635 		if (drhd->ignored) {
2636 			iommu_disable_translation(iommu);
2637 			continue;
2638 		}
2639 
2640 		/*
2641 		 * Find the max PASID size of all IOMMUs in the system.
2642 		 * We need to ensure the system PASID table is no bigger
2643 		 * than the smallest supported size.
2644 		 */
2645 		if (pasid_supported(iommu)) {
2646 			u32 temp = 2 << ecap_pss(iommu->ecap);
2647 
2648 			intel_pasid_max_id = min_t(u32, temp,
2649 						   intel_pasid_max_id);
2650 		}
2651 
2652 		intel_iommu_init_qi(iommu);
2653 
2654 		ret = iommu_init_domains(iommu);
2655 		if (ret)
2656 			goto free_iommu;
2657 
2658 		init_translation_status(iommu);
2659 
2660 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2661 			iommu_disable_translation(iommu);
2662 			clear_translation_pre_enabled(iommu);
2663 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2664 				iommu->name);
2665 		}
2666 
2667 		/*
2668 		 * TBD:
2669 		 * we could share the same root & context tables
2670 		 * among all IOMMUs. Need to split this later.
2671 		 */
2672 		ret = iommu_alloc_root_entry(iommu);
2673 		if (ret)
2674 			goto free_iommu;
2675 
2676 		if (translation_pre_enabled(iommu)) {
2677 			pr_info("Translation already enabled - trying to copy translation structures\n");
2678 
2679 			ret = copy_translation_tables(iommu);
2680 			if (ret) {
2681 				/*
2682 				 * We found the IOMMU with translation
2683 				 * enabled - but failed to copy over the
2684 				 * old root-entry table. Try to proceed
2685 				 * by disabling translation now and
2686 				 * allocating a clean root-entry table.
2687 				 * This might cause DMAR faults, but
2688 				 * probably the dump will still succeed.
2689 				 */
2690 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2691 				       iommu->name);
2692 				iommu_disable_translation(iommu);
2693 				clear_translation_pre_enabled(iommu);
2694 			} else {
2695 				pr_info("Copied translation tables from previous kernel for %s\n",
2696 					iommu->name);
2697 			}
2698 		}
2699 
2700 		if (!ecap_pass_through(iommu->ecap))
2701 			hw_pass_through = 0;
2702 		intel_svm_check(iommu);
2703 	}
2704 
2705 	/*
2706 	 * Now that qi is enabled on all iommus, set the root entry and flush
2707 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2708 	 * flush_context function will loop forever and the boot hangs.
2709 	 */
2710 	for_each_active_iommu(iommu, drhd) {
2711 		iommu_flush_write_buffer(iommu);
2712 		iommu_set_root_entry(iommu);
2713 	}
2714 
2715 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2716 	dmar_map_gfx = 0;
2717 #endif
2718 
2719 	if (!dmar_map_gfx)
2720 		iommu_identity_mapping |= IDENTMAP_GFX;
2721 
2722 	check_tylersburg_isoch();
2723 
2724 	ret = si_domain_init(hw_pass_through);
2725 	if (ret)
2726 		goto free_iommu;
2727 
2728 	/*
2729 	 * for each drhd
2730 	 *   enable fault log
2731 	 *   global invalidate context cache
2732 	 *   global invalidate iotlb
2733 	 *   enable translation
2734 	 */
2735 	for_each_iommu(iommu, drhd) {
2736 		if (drhd->ignored) {
2737 			/*
2738 			 * we always have to disable PMRs or DMA may fail on
2739 			 * this device
2740 			 */
2741 			if (force_on)
2742 				iommu_disable_protect_mem_regions(iommu);
2743 			continue;
2744 		}
2745 
2746 		iommu_flush_write_buffer(iommu);
2747 
2748 #ifdef CONFIG_INTEL_IOMMU_SVM
2749 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2750 			/*
2751 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2752 			 * could cause a lock race condition.
2753 			 */
2754 			up_write(&dmar_global_lock);
2755 			ret = intel_svm_enable_prq(iommu);
2756 			down_write(&dmar_global_lock);
2757 			if (ret)
2758 				goto free_iommu;
2759 		}
2760 #endif
2761 		ret = dmar_set_interrupt(iommu);
2762 		if (ret)
2763 			goto free_iommu;
2764 	}
2765 
2766 	return 0;
2767 
2768 free_iommu:
2769 	for_each_active_iommu(iommu, drhd) {
2770 		disable_dmar_iommu(iommu);
2771 		free_dmar_iommu(iommu);
2772 	}
2773 	if (si_domain) {
2774 		domain_exit(si_domain);
2775 		si_domain = NULL;
2776 	}
2777 
2778 	return ret;
2779 }
2780 
2781 static void __init init_no_remapping_devices(void)
2782 {
2783 	struct dmar_drhd_unit *drhd;
2784 	struct device *dev;
2785 	int i;
2786 
2787 	for_each_drhd_unit(drhd) {
2788 		if (!drhd->include_all) {
2789 			for_each_active_dev_scope(drhd->devices,
2790 						  drhd->devices_cnt, i, dev)
2791 				break;
2792 			/* ignore DMAR unit if no devices exist */
2793 			if (i == drhd->devices_cnt)
2794 				drhd->ignored = 1;
2795 		}
2796 	}
2797 
2798 	for_each_active_drhd_unit(drhd) {
2799 		if (drhd->include_all)
2800 			continue;
2801 
2802 		for_each_active_dev_scope(drhd->devices,
2803 					  drhd->devices_cnt, i, dev)
2804 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2805 				break;
2806 		if (i < drhd->devices_cnt)
2807 			continue;
2808 
2809 		/* This IOMMU has *only* gfx devices. Either bypass it or
2810 		   mark it as gfx-dedicated, as appropriate. */
2811 		drhd->gfx_dedicated = 1;
2812 		if (!dmar_map_gfx)
2813 			drhd->ignored = 1;
2814 	}
2815 }
2816 
2817 #ifdef CONFIG_SUSPEND
2818 static int init_iommu_hw(void)
2819 {
2820 	struct dmar_drhd_unit *drhd;
2821 	struct intel_iommu *iommu = NULL;
2822 	int ret;
2823 
2824 	for_each_active_iommu(iommu, drhd) {
2825 		if (iommu->qi) {
2826 			ret = dmar_reenable_qi(iommu);
2827 			if (ret)
2828 				return ret;
2829 		}
2830 	}
2831 
2832 	for_each_iommu(iommu, drhd) {
2833 		if (drhd->ignored) {
2834 			/*
2835 			 * we always have to disable PMRs or DMA may fail on
2836 			 * this device
2837 			 */
2838 			if (force_on)
2839 				iommu_disable_protect_mem_regions(iommu);
2840 			continue;
2841 		}
2842 
2843 		iommu_flush_write_buffer(iommu);
2844 		iommu_set_root_entry(iommu);
2845 		iommu_enable_translation(iommu);
2846 		iommu_disable_protect_mem_regions(iommu);
2847 	}
2848 
2849 	return 0;
2850 }
2851 
2852 static void iommu_flush_all(void)
2853 {
2854 	struct dmar_drhd_unit *drhd;
2855 	struct intel_iommu *iommu;
2856 
2857 	for_each_active_iommu(iommu, drhd) {
2858 		iommu->flush.flush_context(iommu, 0, 0, 0,
2859 					   DMA_CCMD_GLOBAL_INVL);
2860 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2861 					 DMA_TLB_GLOBAL_FLUSH);
2862 	}
2863 }
2864 
2865 static int iommu_suspend(void)
2866 {
2867 	struct dmar_drhd_unit *drhd;
2868 	struct intel_iommu *iommu = NULL;
2869 	unsigned long flag;
2870 
2871 	iommu_flush_all();
2872 
2873 	for_each_active_iommu(iommu, drhd) {
2874 		iommu_disable_translation(iommu);
2875 
2876 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2877 
2878 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2879 			readl(iommu->reg + DMAR_FECTL_REG);
2880 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2881 			readl(iommu->reg + DMAR_FEDATA_REG);
2882 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2883 			readl(iommu->reg + DMAR_FEADDR_REG);
2884 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2885 			readl(iommu->reg + DMAR_FEUADDR_REG);
2886 
2887 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2888 	}
2889 	return 0;
2890 }
2891 
2892 static void iommu_resume(void)
2893 {
2894 	struct dmar_drhd_unit *drhd;
2895 	struct intel_iommu *iommu = NULL;
2896 	unsigned long flag;
2897 
2898 	if (init_iommu_hw()) {
2899 		if (force_on)
2900 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2901 		else
2902 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2903 		return;
2904 	}
2905 
2906 	for_each_active_iommu(iommu, drhd) {
2907 
2908 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2909 
2910 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2911 			iommu->reg + DMAR_FECTL_REG);
2912 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2913 			iommu->reg + DMAR_FEDATA_REG);
2914 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2915 			iommu->reg + DMAR_FEADDR_REG);
2916 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2917 			iommu->reg + DMAR_FEUADDR_REG);
2918 
2919 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2920 	}
2921 }
2922 
2923 static struct syscore_ops iommu_syscore_ops = {
2924 	.resume		= iommu_resume,
2925 	.suspend	= iommu_suspend,
2926 };
2927 
2928 static void __init init_iommu_pm_ops(void)
2929 {
2930 	register_syscore_ops(&iommu_syscore_ops);
2931 }
2932 
2933 #else
2934 static inline void init_iommu_pm_ops(void) {}
2935 #endif	/* CONFIG_SUSPEND */
2936 
2937 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2938 {
2939 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2940 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2941 	    rmrr->end_address <= rmrr->base_address ||
2942 	    arch_rmrr_sanity_check(rmrr))
2943 		return -EINVAL;
2944 
2945 	return 0;
2946 }
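
/*
 * Example for rmrr_sanity_check() above (illustrative addresses only,
 * assuming arch_rmrr_sanity_check() also passes): an RMRR of
 * [0x000e0000, 0x000fffff] is accepted since the base is page aligned,
 * end + 1 (0x00100000) is page aligned and end > base, whereas
 * [0x000e0000, 0x000e07ff] is rejected because end + 1 is not page
 * aligned.
 */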
2947 
2948 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2949 {
2950 	struct acpi_dmar_reserved_memory *rmrr;
2951 	struct dmar_rmrr_unit *rmrru;
2952 
2953 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2954 	if (rmrr_sanity_check(rmrr)) {
2955 		pr_warn(FW_BUG
2956 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2957 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2958 			   rmrr->base_address, rmrr->end_address,
2959 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2960 			   dmi_get_system_info(DMI_BIOS_VERSION),
2961 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2962 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2963 	}
2964 
2965 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2966 	if (!rmrru)
2967 		goto out;
2968 
2969 	rmrru->hdr = header;
2970 
2971 	rmrru->base_address = rmrr->base_address;
2972 	rmrru->end_address = rmrr->end_address;
2973 
2974 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2975 				((void *)rmrr) + rmrr->header.length,
2976 				&rmrru->devices_cnt);
2977 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2978 		goto free_rmrru;
2979 
2980 	list_add(&rmrru->list, &dmar_rmrr_units);
2981 
2982 	return 0;
2983 free_rmrru:
2984 	kfree(rmrru);
2985 out:
2986 	return -ENOMEM;
2987 }
2988 
2989 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2990 {
2991 	struct dmar_atsr_unit *atsru;
2992 	struct acpi_dmar_atsr *tmp;
2993 
2994 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2995 				dmar_rcu_check()) {
2996 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2997 		if (atsr->segment != tmp->segment)
2998 			continue;
2999 		if (atsr->header.length != tmp->header.length)
3000 			continue;
3001 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3002 			return atsru;
3003 	}
3004 
3005 	return NULL;
3006 }
3007 
3008 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3009 {
3010 	struct acpi_dmar_atsr *atsr;
3011 	struct dmar_atsr_unit *atsru;
3012 
3013 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3014 		return 0;
3015 
3016 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3017 	atsru = dmar_find_atsr(atsr);
3018 	if (atsru)
3019 		return 0;
3020 
3021 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3022 	if (!atsru)
3023 		return -ENOMEM;
3024 
3025 	/*
3026 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3027 	 * copy the memory content because the memory buffer will be freed
3028 	 * on return.
3029 	 */
3030 	atsru->hdr = (void *)(atsru + 1);
3031 	memcpy(atsru->hdr, hdr, hdr->length);
3032 	atsru->include_all = atsr->flags & 0x1;
3033 	if (!atsru->include_all) {
3034 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3035 				(void *)atsr + atsr->header.length,
3036 				&atsru->devices_cnt);
3037 		if (atsru->devices_cnt && atsru->devices == NULL) {
3038 			kfree(atsru);
3039 			return -ENOMEM;
3040 		}
3041 	}
3042 
3043 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3044 
3045 	return 0;
3046 }
3047 
3048 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3049 {
3050 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3051 	kfree(atsru);
3052 }
3053 
3054 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3055 {
3056 	struct acpi_dmar_atsr *atsr;
3057 	struct dmar_atsr_unit *atsru;
3058 
3059 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3060 	atsru = dmar_find_atsr(atsr);
3061 	if (atsru) {
3062 		list_del_rcu(&atsru->list);
3063 		synchronize_rcu();
3064 		intel_iommu_free_atsr(atsru);
3065 	}
3066 
3067 	return 0;
3068 }
3069 
3070 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3071 {
3072 	int i;
3073 	struct device *dev;
3074 	struct acpi_dmar_atsr *atsr;
3075 	struct dmar_atsr_unit *atsru;
3076 
3077 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3078 	atsru = dmar_find_atsr(atsr);
3079 	if (!atsru)
3080 		return 0;
3081 
3082 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3083 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3084 					  i, dev)
3085 			return -EBUSY;
3086 	}
3087 
3088 	return 0;
3089 }
3090 
3091 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3092 {
3093 	struct dmar_satc_unit *satcu;
3094 	struct acpi_dmar_satc *tmp;
3095 
3096 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3097 				dmar_rcu_check()) {
3098 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3099 		if (satc->segment != tmp->segment)
3100 			continue;
3101 		if (satc->header.length != tmp->header.length)
3102 			continue;
3103 		if (memcmp(satc, tmp, satc->header.length) == 0)
3104 			return satcu;
3105 	}
3106 
3107 	return NULL;
3108 }
3109 
3110 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3111 {
3112 	struct acpi_dmar_satc *satc;
3113 	struct dmar_satc_unit *satcu;
3114 
3115 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3116 		return 0;
3117 
3118 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3119 	satcu = dmar_find_satc(satc);
3120 	if (satcu)
3121 		return 0;
3122 
3123 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3124 	if (!satcu)
3125 		return -ENOMEM;
3126 
3127 	satcu->hdr = (void *)(satcu + 1);
3128 	memcpy(satcu->hdr, hdr, hdr->length);
3129 	satcu->atc_required = satc->flags & 0x1;
3130 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3131 					      (void *)satc + satc->header.length,
3132 					      &satcu->devices_cnt);
3133 	if (satcu->devices_cnt && !satcu->devices) {
3134 		kfree(satcu);
3135 		return -ENOMEM;
3136 	}
3137 	list_add_rcu(&satcu->list, &dmar_satc_units);
3138 
3139 	return 0;
3140 }
3141 
3142 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3143 {
3144 	int sp, ret;
3145 	struct intel_iommu *iommu = dmaru->iommu;
3146 
3147 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3148 	if (ret)
3149 		goto out;
3150 
3151 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3152 		pr_warn("%s: Doesn't support hardware pass through.\n",
3153 			iommu->name);
3154 		return -ENXIO;
3155 	}
3156 
3157 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3158 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3159 		pr_warn("%s: Doesn't support large page.\n",
3160 			iommu->name);
3161 		return -ENXIO;
3162 	}
3163 
3164 	/*
3165 	 * Disable translation if already enabled prior to OS handover.
3166 	 */
3167 	if (iommu->gcmd & DMA_GCMD_TE)
3168 		iommu_disable_translation(iommu);
3169 
3170 	ret = iommu_init_domains(iommu);
3171 	if (ret == 0)
3172 		ret = iommu_alloc_root_entry(iommu);
3173 	if (ret)
3174 		goto out;
3175 
3176 	intel_svm_check(iommu);
3177 
3178 	if (dmaru->ignored) {
3179 		/*
3180 		 * we always have to disable PMRs or DMA may fail on this device
3181 		 */
3182 		if (force_on)
3183 			iommu_disable_protect_mem_regions(iommu);
3184 		return 0;
3185 	}
3186 
3187 	intel_iommu_init_qi(iommu);
3188 	iommu_flush_write_buffer(iommu);
3189 
3190 #ifdef CONFIG_INTEL_IOMMU_SVM
3191 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3192 		ret = intel_svm_enable_prq(iommu);
3193 		if (ret)
3194 			goto disable_iommu;
3195 	}
3196 #endif
3197 	ret = dmar_set_interrupt(iommu);
3198 	if (ret)
3199 		goto disable_iommu;
3200 
3201 	iommu_set_root_entry(iommu);
3202 	iommu_enable_translation(iommu);
3203 
3204 	iommu_disable_protect_mem_regions(iommu);
3205 	return 0;
3206 
3207 disable_iommu:
3208 	disable_dmar_iommu(iommu);
3209 out:
3210 	free_dmar_iommu(iommu);
3211 	return ret;
3212 }
3213 
3214 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3215 {
3216 	int ret = 0;
3217 	struct intel_iommu *iommu = dmaru->iommu;
3218 
3219 	if (!intel_iommu_enabled)
3220 		return 0;
3221 	if (iommu == NULL)
3222 	if (!iommu)
3223 
3224 	if (insert) {
3225 		ret = intel_iommu_add(dmaru);
3226 	} else {
3227 		disable_dmar_iommu(iommu);
3228 		free_dmar_iommu(iommu);
3229 	}
3230 
3231 	return ret;
3232 }
3233 
3234 static void intel_iommu_free_dmars(void)
3235 {
3236 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3237 	struct dmar_atsr_unit *atsru, *atsr_n;
3238 	struct dmar_satc_unit *satcu, *satc_n;
3239 
3240 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3241 		list_del(&rmrru->list);
3242 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3243 		kfree(rmrru);
3244 	}
3245 
3246 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3247 		list_del(&atsru->list);
3248 		intel_iommu_free_atsr(atsru);
3249 	}
3250 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3251 		list_del(&satcu->list);
3252 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3253 		kfree(satcu);
3254 	}
3255 }
3256 
3257 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3258 {
3259 	struct dmar_satc_unit *satcu;
3260 	struct acpi_dmar_satc *satc;
3261 	struct device *tmp;
3262 	int i;
3263 
3264 	dev = pci_physfn(dev);
3265 	rcu_read_lock();
3266 
3267 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3268 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3269 		if (satc->segment != pci_domain_nr(dev->bus))
3270 			continue;
3271 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3272 			if (to_pci_dev(tmp) == dev)
3273 				goto out;
3274 	}
3275 	satcu = NULL;
3276 out:
3277 	rcu_read_unlock();
3278 	return satcu;
3279 }
3280 
3281 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3282 {
3283 	int i, ret = 1;
3284 	struct pci_bus *bus;
3285 	struct pci_dev *bridge = NULL;
3286 	struct device *tmp;
3287 	struct acpi_dmar_atsr *atsr;
3288 	struct dmar_atsr_unit *atsru;
3289 	struct dmar_satc_unit *satcu;
3290 
3291 	dev = pci_physfn(dev);
3292 	satcu = dmar_find_matched_satc_unit(dev);
3293 	if (satcu)
3294 		/*
3295 		 * This device supports ATS as it is in the SATC table.
3296 		 * When the IOMMU is in legacy mode, ATS is enabled
3297 		 * automatically by HW for devices that require it, so
3298 		 * the OS should not enable ATS for this device in order
3299 		 * to avoid duplicated TLB invalidations.
3300 		 */
3301 		return !(satcu->atc_required && !sm_supported(iommu));
3302 
3303 	for (bus = dev->bus; bus; bus = bus->parent) {
3304 		bridge = bus->self;
3305 		/* If it's an integrated device, allow ATS */
3306 		if (!bridge)
3307 			return 1;
3308 		/* Connected via non-PCIe: no ATS */
3309 		if (!pci_is_pcie(bridge) ||
3310 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3311 			return 0;
3312 		/* If we found the root port, look it up in the ATSR */
3313 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3314 			break;
3315 	}
3316 
3317 	rcu_read_lock();
3318 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3319 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3320 		if (atsr->segment != pci_domain_nr(dev->bus))
3321 			continue;
3322 
3323 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3324 			if (tmp == &bridge->dev)
3325 				goto out;
3326 
3327 		if (atsru->include_all)
3328 			goto out;
3329 	}
3330 	ret = 0;
3331 out:
3332 	rcu_read_unlock();
3333 
3334 	return ret;
3335 }
3336 
3337 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3338 {
3339 	int ret;
3340 	struct dmar_rmrr_unit *rmrru;
3341 	struct dmar_atsr_unit *atsru;
3342 	struct dmar_satc_unit *satcu;
3343 	struct acpi_dmar_atsr *atsr;
3344 	struct acpi_dmar_reserved_memory *rmrr;
3345 	struct acpi_dmar_satc *satc;
3346 
3347 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3348 		return 0;
3349 
3350 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3351 		rmrr = container_of(rmrru->hdr,
3352 				    struct acpi_dmar_reserved_memory, header);
3353 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3354 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3355 				((void *)rmrr) + rmrr->header.length,
3356 				rmrr->segment, rmrru->devices,
3357 				rmrru->devices_cnt);
3358 			if (ret < 0)
3359 				return ret;
3360 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3361 			dmar_remove_dev_scope(info, rmrr->segment,
3362 				rmrru->devices, rmrru->devices_cnt);
3363 		}
3364 	}
3365 
3366 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3367 		if (atsru->include_all)
3368 			continue;
3369 
3370 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3371 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3372 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3373 					(void *)atsr + atsr->header.length,
3374 					atsr->segment, atsru->devices,
3375 					atsru->devices_cnt);
3376 			if (ret > 0)
3377 				break;
3378 			else if (ret < 0)
3379 				return ret;
3380 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3381 			if (dmar_remove_dev_scope(info, atsr->segment,
3382 					atsru->devices, atsru->devices_cnt))
3383 				break;
3384 		}
3385 	}
3386 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3387 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3388 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3389 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3390 					(void *)satc + satc->header.length,
3391 					satc->segment, satcu->devices,
3392 					satcu->devices_cnt);
3393 			if (ret > 0)
3394 				break;
3395 			else if (ret < 0)
3396 				return ret;
3397 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3398 			if (dmar_remove_dev_scope(info, satc->segment,
3399 					satcu->devices, satcu->devices_cnt))
3400 				break;
3401 		}
3402 	}
3403 
3404 	return 0;
3405 }
3406 
3407 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3408 				       unsigned long val, void *v)
3409 {
3410 	struct memory_notify *mhp = v;
3411 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3412 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3413 			mhp->nr_pages - 1);
3414 
3415 	switch (val) {
3416 	case MEM_GOING_ONLINE:
3417 		if (iommu_domain_identity_map(si_domain,
3418 					      start_vpfn, last_vpfn)) {
3419 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3420 				start_vpfn, last_vpfn);
3421 			return NOTIFY_BAD;
3422 		}
3423 		break;
3424 
3425 	case MEM_OFFLINE:
3426 	case MEM_CANCEL_ONLINE:
3427 		{
3428 			struct dmar_drhd_unit *drhd;
3429 			struct intel_iommu *iommu;
3430 			LIST_HEAD(freelist);
3431 
3432 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3433 
3434 			rcu_read_lock();
3435 			for_each_active_iommu(iommu, drhd)
3436 				iommu_flush_iotlb_psi(iommu, si_domain,
3437 					start_vpfn, mhp->nr_pages,
3438 					list_empty(&freelist), 0);
3439 			rcu_read_unlock();
3440 			put_pages_list(&freelist);
3441 		}
3442 		break;
3443 	}
3444 
3445 	return NOTIFY_OK;
3446 }
3447 
3448 static struct notifier_block intel_iommu_memory_nb = {
3449 	.notifier_call = intel_iommu_memory_notifier,
3450 	.priority = 0
3451 };
3452 
3453 static void intel_disable_iommus(void)
3454 {
3455 	struct intel_iommu *iommu = NULL;
3456 	struct dmar_drhd_unit *drhd;
3457 
3458 	for_each_iommu(iommu, drhd)
3459 		iommu_disable_translation(iommu);
3460 }
3461 
3462 void intel_iommu_shutdown(void)
3463 {
3464 	struct dmar_drhd_unit *drhd;
3465 	struct intel_iommu *iommu = NULL;
3466 
3467 	if (no_iommu || dmar_disabled)
3468 		return;
3469 
3470 	down_write(&dmar_global_lock);
3471 
3472 	/* Disable PMRs explicitly here. */
3473 	for_each_iommu(iommu, drhd)
3474 		iommu_disable_protect_mem_regions(iommu);
3475 
3476 	/* Make sure the IOMMUs are switched off */
3477 	intel_disable_iommus();
3478 
3479 	up_write(&dmar_global_lock);
3480 }
3481 
3482 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3483 {
3484 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3485 
3486 	return container_of(iommu_dev, struct intel_iommu, iommu);
3487 }
3488 
3489 static ssize_t version_show(struct device *dev,
3490 			    struct device_attribute *attr, char *buf)
3491 {
3492 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3493 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3494 	return sysfs_emit(buf, "%d:%d\n",
3495 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3496 }
3497 static DEVICE_ATTR_RO(version);
3498 
3499 static ssize_t address_show(struct device *dev,
3500 			    struct device_attribute *attr, char *buf)
3501 {
3502 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3503 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3504 }
3505 static DEVICE_ATTR_RO(address);
3506 
3507 static ssize_t cap_show(struct device *dev,
3508 			struct device_attribute *attr, char *buf)
3509 {
3510 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3511 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3512 }
3513 static DEVICE_ATTR_RO(cap);
3514 
3515 static ssize_t ecap_show(struct device *dev,
3516 			 struct device_attribute *attr, char *buf)
3517 {
3518 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3519 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3520 }
3521 static DEVICE_ATTR_RO(ecap);
3522 
3523 static ssize_t domains_supported_show(struct device *dev,
3524 				      struct device_attribute *attr, char *buf)
3525 {
3526 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3527 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3528 }
3529 static DEVICE_ATTR_RO(domains_supported);
3530 
3531 static ssize_t domains_used_show(struct device *dev,
3532 				 struct device_attribute *attr, char *buf)
3533 {
3534 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3535 	return sysfs_emit(buf, "%d\n",
3536 			  bitmap_weight(iommu->domain_ids,
3537 					cap_ndoms(iommu->cap)));
3538 }
3539 static DEVICE_ATTR_RO(domains_used);
3540 
3541 static struct attribute *intel_iommu_attrs[] = {
3542 	&dev_attr_version.attr,
3543 	&dev_attr_address.attr,
3544 	&dev_attr_cap.attr,
3545 	&dev_attr_ecap.attr,
3546 	&dev_attr_domains_supported.attr,
3547 	&dev_attr_domains_used.attr,
3548 	NULL,
3549 };
3550 
3551 static struct attribute_group intel_iommu_group = {
3552 	.name = "intel-iommu",
3553 	.attrs = intel_iommu_attrs,
3554 };
3555 
3556 const struct attribute_group *intel_iommu_groups[] = {
3557 	&intel_iommu_group,
3558 	NULL,
3559 };
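
/*
 * Usage note (illustrative; based on the sysfs registration done in
 * intel_iommu_init() below): these attributes typically appear under
 * /sys/class/iommu/<iommu name>/intel-iommu/, so for example
 * "cat /sys/class/iommu/dmar0/intel-iommu/cap" dumps the capability
 * register of the first DMAR unit.
 */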
3560 
3561 static bool has_external_pci(void)
3562 {
3563 	struct pci_dev *pdev = NULL;
3564 
3565 	for_each_pci_dev(pdev)
3566 		if (pdev->external_facing) {
3567 			pci_dev_put(pdev);
3568 			return true;
3569 		}
3570 
3571 	return false;
3572 }
3573 
3574 static int __init platform_optin_force_iommu(void)
3575 {
3576 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3577 		return 0;
3578 
3579 	if (no_iommu || dmar_disabled)
3580 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3581 
3582 	/*
3583 	 * If Intel-IOMMU is disabled by default, we will apply identity
3584 	 * map for all devices except those marked as being untrusted.
3585 	 */
3586 	if (dmar_disabled)
3587 		iommu_set_default_passthrough(false);
3588 
3589 	dmar_disabled = 0;
3590 	no_iommu = 0;
3591 
3592 	return 1;
3593 }
3594 
3595 static int __init probe_acpi_namespace_devices(void)
3596 {
3597 	struct dmar_drhd_unit *drhd;
3598 	/* To avoid a -Wunused-but-set-variable warning. */
3599 	struct intel_iommu *iommu __maybe_unused;
3600 	struct device *dev;
3601 	int i, ret = 0;
3602 
3603 	for_each_active_iommu(iommu, drhd) {
3604 		for_each_active_dev_scope(drhd->devices,
3605 					  drhd->devices_cnt, i, dev) {
3606 			struct acpi_device_physical_node *pn;
3607 			struct acpi_device *adev;
3608 
3609 			if (dev->bus != &acpi_bus_type)
3610 				continue;
3611 
3612 			adev = to_acpi_device(dev);
3613 			mutex_lock(&adev->physical_node_lock);
3614 			list_for_each_entry(pn,
3615 					    &adev->physical_node_list, node) {
3616 				ret = iommu_probe_device(pn->dev);
3617 				if (ret)
3618 					break;
3619 			}
3620 			mutex_unlock(&adev->physical_node_lock);
3621 
3622 			if (ret)
3623 				return ret;
3624 		}
3625 	}
3626 
3627 	return 0;
3628 }
3629 
3630 static __init int tboot_force_iommu(void)
3631 {
3632 	if (!tboot_enabled())
3633 		return 0;
3634 
3635 	if (no_iommu || dmar_disabled)
3636 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3637 
3638 	dmar_disabled = 0;
3639 	no_iommu = 0;
3640 
3641 	return 1;
3642 }
3643 
3644 int __init intel_iommu_init(void)
3645 {
3646 	int ret = -ENODEV;
3647 	struct dmar_drhd_unit *drhd;
3648 	struct intel_iommu *iommu;
3649 
3650 	/*
3651 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3652 	 * opt in, so enforce that.
3653 	 */
3654 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3655 		    platform_optin_force_iommu();
3656 
3657 	down_write(&dmar_global_lock);
3658 	if (dmar_table_init()) {
3659 		if (force_on)
3660 			panic("tboot: Failed to initialize DMAR table\n");
3661 		goto out_free_dmar;
3662 	}
3663 
3664 	if (dmar_dev_scope_init() < 0) {
3665 		if (force_on)
3666 			panic("tboot: Failed to initialize DMAR device scope\n");
3667 		goto out_free_dmar;
3668 	}
3669 
3670 	up_write(&dmar_global_lock);
3671 
3672 	/*
3673 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3674 	 * complain later when we register it under the lock.
3675 	 */
3676 	dmar_register_bus_notifier();
3677 
3678 	down_write(&dmar_global_lock);
3679 
3680 	if (!no_iommu)
3681 		intel_iommu_debugfs_init();
3682 
3683 	if (no_iommu || dmar_disabled) {
3684 		/*
3685 		 * We exit the function here to ensure IOMMU's remapping and
3686 		 * mempool aren't set up, which means that the IOMMU's PMRs
3687 		 * won't be disabled via the call to init_dmars(). So disable
3688 		 * it explicitly here. The PMRs were setup by tboot prior to
3689 		 * calling SENTER, but the kernel is expected to reset/tear
3690 		 * down the PMRs.
3691 		 */
3692 		if (intel_iommu_tboot_noforce) {
3693 			for_each_iommu(iommu, drhd)
3694 				iommu_disable_protect_mem_regions(iommu);
3695 		}
3696 
3697 		/*
3698 		 * Make sure the IOMMUs are switched off, even when we
3699 		 * boot into a kexec kernel and the previous kernel left
3700 		 * them enabled
3701 		 */
3702 		intel_disable_iommus();
3703 		goto out_free_dmar;
3704 	}
3705 
3706 	if (list_empty(&dmar_rmrr_units))
3707 		pr_info("No RMRR found\n");
3708 
3709 	if (list_empty(&dmar_atsr_units))
3710 		pr_info("No ATSR found\n");
3711 
3712 	if (list_empty(&dmar_satc_units))
3713 		pr_info("No SATC found\n");
3714 
3715 	init_no_remapping_devices();
3716 
3717 	ret = init_dmars();
3718 	if (ret) {
3719 		if (force_on)
3720 			panic("tboot: Failed to initialize DMARs\n");
3721 		pr_err("Initialization failed\n");
3722 		goto out_free_dmar;
3723 	}
3724 	up_write(&dmar_global_lock);
3725 
3726 	init_iommu_pm_ops();
3727 
3728 	down_read(&dmar_global_lock);
3729 	for_each_active_iommu(iommu, drhd) {
3730 		/*
3731 		 * The flush queue implementation does not perform
3732 		 * page-selective invalidations that are required for efficient
3733 		 * TLB flushes in virtual environments.  The benefit of batching
3734 		 * is likely to be much lower than the overhead of synchronizing
3735 		 * the virtual and physical IOMMU page-tables.
3736 		 */
3737 		if (cap_caching_mode(iommu->cap) &&
3738 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3739 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3740 			iommu_set_dma_strict();
3741 		}
3742 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3743 				       intel_iommu_groups,
3744 				       "%s", iommu->name);
3745 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3746 
3747 		iommu_pmu_register(iommu);
3748 	}
3749 	up_read(&dmar_global_lock);
3750 
3751 	if (si_domain && !hw_pass_through)
3752 		register_memory_notifier(&intel_iommu_memory_nb);
3753 
3754 	down_read(&dmar_global_lock);
3755 	if (probe_acpi_namespace_devices())
3756 		pr_warn("ACPI name space devices didn't probe correctly\n");
3757 
3758 	/* Finally, we enable the DMA remapping hardware. */
3759 	for_each_iommu(iommu, drhd) {
3760 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3761 			iommu_enable_translation(iommu);
3762 
3763 		iommu_disable_protect_mem_regions(iommu);
3764 	}
3765 	up_read(&dmar_global_lock);
3766 
3767 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3768 
3769 	intel_iommu_enabled = 1;
3770 
3771 	return 0;
3772 
3773 out_free_dmar:
3774 	intel_iommu_free_dmars();
3775 	up_write(&dmar_global_lock);
3776 	return ret;
3777 }
3778 
3779 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3780 {
3781 	struct device_domain_info *info = opaque;
3782 
3783 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3784 	return 0;
3785 }
3786 
3787 /*
3788  * NB - intel-iommu lacks any sort of reference counting for the users of
3789  * dependent devices.  If multiple endpoints have intersecting dependent
3790  * devices, unbinding the driver from any one of them will possibly leave
3791  * the others unable to operate.
3792  */
3793 static void domain_context_clear(struct device_domain_info *info)
3794 {
3795 	if (!dev_is_pci(info->dev)) {
3796 		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}
3797 
3798 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3799 			       &domain_context_clear_one_cb, info);
3800 }
3801 
3802 static void dmar_remove_one_dev_info(struct device *dev)
3803 {
3804 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3805 	struct dmar_domain *domain = info->domain;
3806 	struct intel_iommu *iommu = info->iommu;
3807 	unsigned long flags;
3808 
3809 	if (!dev_is_real_dma_subdevice(info->dev)) {
3810 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3811 			intel_pasid_tear_down_entry(iommu, info->dev,
3812 					IOMMU_NO_PASID, false);
3813 
3814 		iommu_disable_pci_caps(info);
3815 		domain_context_clear(info);
3816 	}
3817 
3818 	spin_lock_irqsave(&domain->lock, flags);
3819 	list_del(&info->link);
3820 	spin_unlock_irqrestore(&domain->lock, flags);
3821 
3822 	domain_detach_iommu(domain, iommu);
3823 	info->domain = NULL;
3824 }
3825 
3826 /*
3827  * Clear the page table pointer in context or pasid table entries so that
3828  * all DMA requests without PASID from the device are blocked. If the page
3829  * table has been set, clean up the data structures.
3830  */
3831 void device_block_translation(struct device *dev)
3832 {
3833 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3834 	struct intel_iommu *iommu = info->iommu;
3835 	unsigned long flags;
3836 
3837 	iommu_disable_pci_caps(info);
3838 	if (!dev_is_real_dma_subdevice(dev)) {
3839 		if (sm_supported(iommu))
3840 			intel_pasid_tear_down_entry(iommu, dev,
3841 						    IOMMU_NO_PASID, false);
3842 		else
3843 			domain_context_clear(info);
3844 	}
3845 
3846 	if (!info->domain)
3847 		return;
3848 
3849 	spin_lock_irqsave(&info->domain->lock, flags);
3850 	list_del(&info->link);
3851 	spin_unlock_irqrestore(&info->domain->lock, flags);
3852 
3853 	domain_detach_iommu(info->domain, iommu);
3854 	info->domain = NULL;
3855 }
3856 
3857 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3858 {
3859 	int adjust_width;
3860 
3861 	/* calculate AGAW */
3862 	domain->gaw = guest_width;
3863 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3864 	domain->agaw = width_to_agaw(adjust_width);
3865 
3866 	domain->iommu_coherency = false;
3867 	domain->iommu_superpage = 0;
3868 	domain->max_addr = 0;
3869 
3870 	/* always allocate the top pgd */
3871 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3872 	if (!domain->pgd)
3873 		return -ENOMEM;
3874 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3875 	return 0;
3876 }
3877 
3878 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3879 				      struct device *dev)
3880 {
3881 	device_block_translation(dev);
3882 	return 0;
3883 }
3884 
3885 static struct iommu_domain blocking_domain = {
3886 	.type = IOMMU_DOMAIN_BLOCKED,
3887 	.ops = &(const struct iommu_domain_ops) {
3888 		.attach_dev	= blocking_domain_attach_dev,
3889 	}
3890 };
3891 
3892 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3893 {
3894 	struct dmar_domain *dmar_domain;
3895 	struct iommu_domain *domain;
3896 
3897 	switch (type) {
3898 	case IOMMU_DOMAIN_DMA:
3899 	case IOMMU_DOMAIN_UNMANAGED:
3900 		dmar_domain = alloc_domain(type);
3901 		if (!dmar_domain) {
3902 			pr_err("Can't allocate dmar_domain\n");
3903 			return NULL;
3904 		}
3905 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3906 			pr_err("Domain initialization failed\n");
3907 			domain_exit(dmar_domain);
3908 			return NULL;
3909 		}
3910 
3911 		domain = &dmar_domain->domain;
3912 		domain->geometry.aperture_start = 0;
3913 		domain->geometry.aperture_end   =
3914 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3915 		domain->geometry.force_aperture = true;
3916 
3917 		return domain;
3918 	case IOMMU_DOMAIN_IDENTITY:
3919 		return &si_domain->domain;
3920 	case IOMMU_DOMAIN_SVA:
3921 		return intel_svm_domain_alloc();
3922 	default:
3923 		return NULL;
3924 	}
3925 
3926 	return NULL;
3927 }
3928 
3929 static struct iommu_domain *
3930 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3931 			      struct iommu_domain *parent,
3932 			      const struct iommu_user_data *user_data)
3933 {
3934 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3935 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3936 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3937 	struct intel_iommu *iommu = info->iommu;
3938 	struct dmar_domain *dmar_domain;
3939 	struct iommu_domain *domain;
3940 
3941 	/* Must be NESTING domain */
3942 	if (parent) {
3943 		if (!nested_supported(iommu) || flags)
3944 			return ERR_PTR(-EOPNOTSUPP);
3945 		return intel_nested_domain_alloc(parent, user_data);
3946 	}
3947 
3948 	if (flags &
3949 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3950 		return ERR_PTR(-EOPNOTSUPP);
3951 	if (nested_parent && !nested_supported(iommu))
3952 		return ERR_PTR(-EOPNOTSUPP);
3953 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3954 		return ERR_PTR(-EOPNOTSUPP);
3955 
3956 	/*
3957 	 * The domain_alloc_user op needs to fully initialize a domain before
3958 	 * returning, so use iommu_domain_alloc() here for simplicity.
3959 	 */
3960 	domain = iommu_domain_alloc(dev->bus);
3961 	if (!domain)
3962 		return ERR_PTR(-ENOMEM);
3963 
3964 	dmar_domain = to_dmar_domain(domain);
3965 
3966 	if (nested_parent) {
3967 		dmar_domain->nested_parent = true;
3968 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3969 		spin_lock_init(&dmar_domain->s1_lock);
3970 	}
3971 
3972 	if (dirty_tracking) {
3973 		if (dmar_domain->use_first_level) {
3974 			iommu_domain_free(domain);
3975 			return ERR_PTR(-EOPNOTSUPP);
3976 		}
3977 		domain->dirty_ops = &intel_dirty_ops;
3978 	}
3979 
3980 	return domain;
3981 }
3982 
3983 static void intel_iommu_domain_free(struct iommu_domain *domain)
3984 {
3985 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3986 
3987 	WARN_ON(dmar_domain->nested_parent &&
3988 		!list_empty(&dmar_domain->s1_domains));
3989 	if (domain != &si_domain->domain)
3990 		domain_exit(dmar_domain);
3991 }
3992 
3993 int prepare_domain_attach_device(struct iommu_domain *domain,
3994 				 struct device *dev)
3995 {
3996 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3997 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3998 	struct intel_iommu *iommu = info->iommu;
3999 	int addr_width;
4000 
4001 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4002 		return -EINVAL;
4003 
4004 	if (domain->dirty_ops && !ssads_supported(iommu))
4005 		return -EINVAL;
4006 
4007 	/* check if this iommu agaw is sufficient for max mapped address */
4008 	addr_width = agaw_to_width(iommu->agaw);
4009 	if (addr_width > cap_mgaw(iommu->cap))
4010 		addr_width = cap_mgaw(iommu->cap);
4011 
4012 	if (dmar_domain->max_addr > (1LL << addr_width))
4013 		return -EINVAL;
4014 	dmar_domain->gaw = addr_width;
4015 
4016 	/*
4017 	 * Knock out extra levels of page tables if necessary
4018 	 */
4019 	while (iommu->agaw < dmar_domain->agaw) {
4020 		struct dma_pte *pte;
4021 
4022 		pte = dmar_domain->pgd;
4023 		if (dma_pte_present(pte)) {
4024 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4025 			free_pgtable_page(pte);
4026 		}
4027 		dmar_domain->agaw--;
4028 	}
4029 
4030 	return 0;
4031 }
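
/*
 * Illustrative note, not from the original source: agaw_to_width() grows in
 * 9-bit steps (agaw 1 -> 39 bits, 2 -> 48 bits, 3 -> 57 bits). So if a domain
 * was sized for a 57-bit width (agaw 3) but the iommu only implements agaw 2,
 * the loop above drops exactly one top level: the old top table is freed and
 * dmar_domain->pgd becomes the table referenced by its first present entry,
 * leaving a 4-level table that covers 48 bits.
 */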
4032 
4033 static int intel_iommu_attach_device(struct iommu_domain *domain,
4034 				     struct device *dev)
4035 {
4036 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4037 	int ret;
4038 
4039 	if (info->domain)
4040 		device_block_translation(dev);
4041 
4042 	ret = prepare_domain_attach_device(domain, dev);
4043 	if (ret)
4044 		return ret;
4045 
4046 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4047 }
4048 
4049 static int intel_iommu_map(struct iommu_domain *domain,
4050 			   unsigned long iova, phys_addr_t hpa,
4051 			   size_t size, int iommu_prot, gfp_t gfp)
4052 {
4053 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4054 	u64 max_addr;
4055 	int prot = 0;
4056 
4057 	if (iommu_prot & IOMMU_READ)
4058 		prot |= DMA_PTE_READ;
4059 	if (iommu_prot & IOMMU_WRITE)
4060 		prot |= DMA_PTE_WRITE;
4061 	if (dmar_domain->set_pte_snp)
4062 		prot |= DMA_PTE_SNP;
4063 
4064 	max_addr = iova + size;
4065 	if (dmar_domain->max_addr < max_addr) {
4066 		u64 end;
4067 
4068 		/* check if minimum agaw is sufficient for mapped address */
4069 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4070 		if (end < max_addr) {
4071 			pr_err("%s: iommu width (%d) is not "
4072 			       "sufficient for the mapped address (%llx)\n",
4073 			       __func__, dmar_domain->gaw, max_addr);
4074 			return -EFAULT;
4075 		}
4076 		dmar_domain->max_addr = max_addr;
4077 	}
4078 	/* Round up size to next multiple of PAGE_SIZE, if it and
4079 	   the low bits of hpa would take us onto the next page */
4080 	size = aligned_nrpages(hpa, size);
4081 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4082 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4083 }
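
/*
 * Illustrative note, not from the original source: for a hypothetical domain
 * with gaw == 48, __DOMAIN_MAX_ADDR(48) + 1 is 1ULL << 48, so any mapping
 * whose iova + size would cross the 256TiB boundary is rejected with -EFAULT
 * before any page-table entry is written; otherwise max_addr simply tracks
 * the highest address mapped so far.
 */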
4084 
4085 static int intel_iommu_map_pages(struct iommu_domain *domain,
4086 				 unsigned long iova, phys_addr_t paddr,
4087 				 size_t pgsize, size_t pgcount,
4088 				 int prot, gfp_t gfp, size_t *mapped)
4089 {
4090 	unsigned long pgshift = __ffs(pgsize);
4091 	size_t size = pgcount << pgshift;
4092 	int ret;
4093 
4094 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4095 		return -EINVAL;
4096 
4097 	if (!IS_ALIGNED(iova | paddr, pgsize))
4098 		return -EINVAL;
4099 
4100 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4101 	if (!ret && mapped)
4102 		*mapped = size;
4103 
4104 	return ret;
4105 }
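
/*
 * Illustrative note, not from the original source: with a hypothetical call
 * of pgsize == SZ_2M and pgcount == 4, pgshift is __ffs(SZ_2M) == 21 and the
 * whole request is forwarded to intel_iommu_map() as one contiguous
 * 4 << 21 == 8MiB range; the alignment check above guarantees the 2MiB
 * boundaries needed for superpage PTEs.
 */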
4106 
4107 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4108 				unsigned long iova, size_t size,
4109 				struct iommu_iotlb_gather *gather)
4110 {
4111 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4112 	unsigned long start_pfn, last_pfn;
4113 	int level = 0;
4114 
4115 	/* Cope with horrid API which requires us to unmap more than the
4116 	   size argument if it happens to be a large-page mapping. */
4117 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4118 				     &level, GFP_ATOMIC)))
4119 		return 0;
4120 
4121 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4122 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4123 
4124 	start_pfn = iova >> VTD_PAGE_SHIFT;
4125 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4126 
4127 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4128 
4129 	if (dmar_domain->max_addr == iova + size)
4130 		dmar_domain->max_addr = iova;
4131 
4132 	/*
4133 	 * We do not use page-selective IOTLB invalidation in the flush queue,
4134 	 * so there is no need to track pages and sync the iotlb.
4135 	 */
4136 	if (!iommu_iotlb_gather_queued(gather))
4137 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4138 
4139 	return size;
4140 }
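
/*
 * Illustrative note, not from the original source: if the IOVA is covered by
 * a 2MiB superpage PTE, pfn_to_dma_pte() returns level 2 and the size is
 * rounded up to VTD_PAGE_SIZE << 9 == 2MiB even if the caller asked for only
 * 4KiB, because a superpage mapping can only be torn down as a whole.
 */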
4141 
4142 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4143 				      unsigned long iova,
4144 				      size_t pgsize, size_t pgcount,
4145 				      struct iommu_iotlb_gather *gather)
4146 {
4147 	unsigned long pgshift = __ffs(pgsize);
4148 	size_t size = pgcount << pgshift;
4149 
4150 	return intel_iommu_unmap(domain, iova, size, gather);
4151 }
4152 
4153 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4154 				 struct iommu_iotlb_gather *gather)
4155 {
4156 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4157 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4158 	size_t size = gather->end - gather->start;
4159 	struct iommu_domain_info *info;
4160 	unsigned long start_pfn;
4161 	unsigned long nrpages;
4162 	unsigned long i;
4163 
4164 	nrpages = aligned_nrpages(gather->start, size);
4165 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4166 
4167 	xa_for_each(&dmar_domain->iommu_array, i, info)
4168 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4169 				      start_pfn, nrpages,
4170 				      list_empty(&gather->freelist), 0);
4171 
4172 	if (dmar_domain->nested_parent)
4173 		parent_domain_flush(dmar_domain, start_pfn, nrpages,
4174 				    list_empty(&gather->freelist));
4175 	put_pages_list(&gather->freelist);
4176 }
4177 
4178 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4179 					    dma_addr_t iova)
4180 {
4181 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4182 	struct dma_pte *pte;
4183 	int level = 0;
4184 	u64 phys = 0;
4185 
4186 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4187 			     GFP_ATOMIC);
4188 	if (pte && dma_pte_present(pte))
4189 		phys = dma_pte_addr(pte) +
4190 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4191 						VTD_PAGE_SHIFT) - 1));
4192 
4193 	return phys;
4194 }
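
/*
 * Illustrative note, not from the original source: when the walk stops at a
 * 2MiB superpage (level 2), level_to_offset_bits() is 9, so the low
 * 9 + VTD_PAGE_SHIFT == 21 bits of the IOVA are kept as the offset into the
 * superpage and added to the page-frame address taken from the PTE.
 */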
4195 
4196 static bool domain_support_force_snooping(struct dmar_domain *domain)
4197 {
4198 	struct device_domain_info *info;
4199 	bool support = true;
4200 
4201 	assert_spin_locked(&domain->lock);
4202 	list_for_each_entry(info, &domain->devices, link) {
4203 		if (!ecap_sc_support(info->iommu->ecap)) {
4204 			support = false;
4205 			break;
4206 		}
4207 	}
4208 
4209 	return support;
4210 }
4211 
4212 static void domain_set_force_snooping(struct dmar_domain *domain)
4213 {
4214 	struct device_domain_info *info;
4215 
4216 	assert_spin_locked(&domain->lock);
4217 	/*
4218 	 * The second-level page table supports per-PTE snoop control. The
4219 	 * iommu_map() interface will handle this by setting the SNP bit.
4220 	 */
4221 	if (!domain->use_first_level) {
4222 		domain->set_pte_snp = true;
4223 		return;
4224 	}
4225 
4226 	list_for_each_entry(info, &domain->devices, link)
4227 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4228 						     IOMMU_NO_PASID);
4229 }
4230 
4231 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4232 {
4233 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4234 	unsigned long flags;
4235 
4236 	if (dmar_domain->force_snooping)
4237 		return true;
4238 
4239 	spin_lock_irqsave(&dmar_domain->lock, flags);
4240 	if (!domain_support_force_snooping(dmar_domain) ||
4241 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4242 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4243 		return false;
4244 	}
4245 
4246 	domain_set_force_snooping(dmar_domain);
4247 	dmar_domain->force_snooping = true;
4248 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4249 
4250 	return true;
4251 }
4252 
4253 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4254 {
4255 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4256 
4257 	switch (cap) {
4258 	case IOMMU_CAP_CACHE_COHERENCY:
4259 	case IOMMU_CAP_DEFERRED_FLUSH:
4260 		return true;
4261 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4262 		return dmar_platform_optin();
4263 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4264 		return ecap_sc_support(info->iommu->ecap);
4265 	case IOMMU_CAP_DIRTY_TRACKING:
4266 		return ssads_supported(info->iommu);
4267 	default:
4268 		return false;
4269 	}
4270 }
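
/*
 * Illustrative sketch, not part of this driver: a hypothetical consumer would
 * normally reach the callback above through the generic helper, e.g.
 *
 *	if (device_iommu_capable(dev, IOMMU_CAP_DIRTY_TRACKING))
 *		pr_info("dirty tracking is available\n");
 *
 * which for VT-d reduces to the ssads_supported() check.
 */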
4271 
4272 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4273 {
4274 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4275 	struct device_domain_info *info;
4276 	struct intel_iommu *iommu;
4277 	u8 bus, devfn;
4278 	int ret;
4279 
4280 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4281 	if (!iommu || !iommu->iommu.ops)
4282 		return ERR_PTR(-ENODEV);
4283 
4284 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4285 	if (!info)
4286 		return ERR_PTR(-ENOMEM);
4287 
4288 	if (dev_is_real_dma_subdevice(dev)) {
4289 		info->bus = pdev->bus->number;
4290 		info->devfn = pdev->devfn;
4291 		info->segment = pci_domain_nr(pdev->bus);
4292 	} else {
4293 		info->bus = bus;
4294 		info->devfn = devfn;
4295 		info->segment = iommu->segment;
4296 	}
4297 
4298 	info->dev = dev;
4299 	info->iommu = iommu;
4300 	if (dev_is_pci(dev)) {
4301 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4302 		    pci_ats_supported(pdev) &&
4303 		    dmar_ats_supported(pdev, iommu)) {
4304 			info->ats_supported = 1;
4305 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4306 
4307 			/*
4308 			 * For IOMMUs that support device IOTLB throttling
4309 			 * (DIT), we assign a PFSID to a VF's invalidation
4310 			 * descriptors so that the IOMMU HW can gauge queue
4311 			 * depth at the PF level. If DIT is not supported,
4312 			 * PFSID is treated as reserved and should be set to 0.
4313 			 */
4314 			if (ecap_dit(iommu->ecap))
4315 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4316 			info->ats_qdep = pci_ats_queue_depth(pdev);
4317 		}
4318 		if (sm_supported(iommu)) {
4319 			if (pasid_supported(iommu)) {
4320 				int features = pci_pasid_features(pdev);
4321 
4322 				if (features >= 0)
4323 					info->pasid_supported = features | 1;
4324 			}
4325 
4326 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4327 			    pci_pri_supported(pdev))
4328 				info->pri_supported = 1;
4329 		}
4330 	}
4331 
4332 	dev_iommu_priv_set(dev, info);
4333 
4334 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4335 		ret = intel_pasid_alloc_table(dev);
4336 		if (ret) {
4337 			dev_err(dev, "PASID table allocation failed\n");
4338 			kfree(info);
4339 			return ERR_PTR(ret);
4340 		}
4341 	}
4342 
4343 	intel_iommu_debugfs_create_dev(info);
4344 
4345 	return &iommu->iommu;
4346 }
4347 
4348 static void intel_iommu_release_device(struct device *dev)
4349 {
4350 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4351 
4352 	dmar_remove_one_dev_info(dev);
4353 	intel_pasid_free_table(dev);
4354 	intel_iommu_debugfs_remove_dev(info);
4355 	kfree(info);
4356 	set_dma_ops(dev, NULL);
4357 }
4358 
4359 static void intel_iommu_probe_finalize(struct device *dev)
4360 {
4361 	set_dma_ops(dev, NULL);
4362 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4363 }
4364 
4365 static void intel_iommu_get_resv_regions(struct device *device,
4366 					 struct list_head *head)
4367 {
4368 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4369 	struct iommu_resv_region *reg;
4370 	struct dmar_rmrr_unit *rmrr;
4371 	struct device *i_dev;
4372 	int i;
4373 
4374 	rcu_read_lock();
4375 	for_each_rmrr_units(rmrr) {
4376 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4377 					  i, i_dev) {
4378 			struct iommu_resv_region *resv;
4379 			enum iommu_resv_type type;
4380 			size_t length;
4381 
4382 			if (i_dev != device &&
4383 			    !is_downstream_to_pci_bridge(device, i_dev))
4384 				continue;
4385 
4386 			length = rmrr->end_address - rmrr->base_address + 1;
4387 
4388 			type = device_rmrr_is_relaxable(device) ?
4389 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4390 
4391 			resv = iommu_alloc_resv_region(rmrr->base_address,
4392 						       length, prot, type,
4393 						       GFP_ATOMIC);
4394 			if (!resv)
4395 				break;
4396 
4397 			list_add_tail(&resv->list, head);
4398 		}
4399 	}
4400 	rcu_read_unlock();
4401 
4402 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4403 	if (dev_is_pci(device)) {
4404 		struct pci_dev *pdev = to_pci_dev(device);
4405 
4406 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4407 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4408 					IOMMU_RESV_DIRECT_RELAXABLE,
4409 					GFP_KERNEL);
4410 			if (reg)
4411 				list_add_tail(&reg->list, head);
4412 		}
4413 	}
4414 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4415 
4416 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4417 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4418 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4419 	if (!reg)
4420 		return;
4421 	list_add_tail(&reg->list, head);
4422 }
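
/*
 * Illustrative sketch, not part of this driver: callers retrieve and release
 * these regions through the generic helpers, e.g.
 *
 *	LIST_HEAD(resv);
 *	struct iommu_resv_region *entry;
 *
 *	iommu_get_resv_regions(dev, &resv);
 *	list_for_each_entry(entry, &resv, list)
 *		dev_info(dev, "reserved [%pa + %zx], type %d\n",
 *			 &entry->start, entry->length, entry->type);
 *	iommu_put_resv_regions(dev, &resv);
 */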
4423 
4424 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4425 {
4426 	if (dev_is_pci(dev))
4427 		return pci_device_group(dev);
4428 	return generic_device_group(dev);
4429 }
4430 
4431 static int intel_iommu_enable_sva(struct device *dev)
4432 {
4433 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4434 	struct intel_iommu *iommu;
4435 
4436 	if (!info || dmar_disabled)
4437 		return -EINVAL;
4438 
4439 	iommu = info->iommu;
4440 	if (!iommu)
4441 		return -EINVAL;
4442 
4443 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4444 		return -ENODEV;
4445 
4446 	if (!info->pasid_enabled || !info->ats_enabled)
4447 		return -EINVAL;
4448 
4449 	/*
4450 	 * Devices that have device-specific I/O fault handling should not
4451 	 * support PCI/PRI. The IOMMU side has no means to check the
4452 	 * capability of device-specific IOPF.  Therefore, the IOMMU can only
4453 	 * assume that if the device driver enables SVA on a non-PRI
4454 	 * device, it will handle IOPF in its own way.
4455 	 */
4456 	if (!info->pri_supported)
4457 		return 0;
4458 
4459 	/* Devices supporting PRI should have it enabled. */
4460 	if (!info->pri_enabled)
4461 		return -EINVAL;
4462 
4463 	return 0;
4464 }
4465 
4466 static int intel_iommu_enable_iopf(struct device *dev)
4467 {
4468 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4469 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4470 	struct intel_iommu *iommu;
4471 	int ret;
4472 
4473 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4474 		return -ENODEV;
4475 
4476 	if (info->pri_enabled)
4477 		return -EBUSY;
4478 
4479 	iommu = info->iommu;
4480 	if (!iommu)
4481 		return -EINVAL;
4482 
4483 	/* PASID is required in PRG Response Message. */
4484 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4485 		return -EINVAL;
4486 
4487 	ret = pci_reset_pri(pdev);
4488 	if (ret)
4489 		return ret;
4490 
4491 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4492 	if (ret)
4493 		return ret;
4494 
4495 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4496 	if (ret)
4497 		goto iopf_remove_device;
4498 
4499 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4500 	if (ret)
4501 		goto iopf_unregister_handler;
4502 	info->pri_enabled = 1;
4503 
4504 	return 0;
4505 
4506 iopf_unregister_handler:
4507 	iommu_unregister_device_fault_handler(dev);
4508 iopf_remove_device:
4509 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4510 
4511 	return ret;
4512 }
4513 
4514 static int intel_iommu_disable_iopf(struct device *dev)
4515 {
4516 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4517 	struct intel_iommu *iommu = info->iommu;
4518 
4519 	if (!info->pri_enabled)
4520 		return -EINVAL;
4521 
4522 	/*
4523 	 * The PCIe spec states that by clearing the PRI enable bit, the Page
4524 	 * Request Interface will not issue new page requests, but may still
4525 	 * have outstanding page requests that have been transmitted or are
4526 	 * queued for transmission. This is supposed to be called after
4527 	 * the device driver has stopped DMA, all PASIDs have been
4528 	 * unbound and the outstanding PRQs have been drained.
4529 	 */
4530 	pci_disable_pri(to_pci_dev(dev));
4531 	info->pri_enabled = 0;
4532 
4533 	/*
4534 	 * With PRI disabled and outstanding PRQs drained, unregistering
4535 	 * fault handler and removing device from iopf queue should never
4536 	 * fail.
4537 	 */
4538 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4539 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4540 
4541 	return 0;
4542 }
4543 
4544 static int
4545 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4546 {
4547 	switch (feat) {
4548 	case IOMMU_DEV_FEAT_IOPF:
4549 		return intel_iommu_enable_iopf(dev);
4550 
4551 	case IOMMU_DEV_FEAT_SVA:
4552 		return intel_iommu_enable_sva(dev);
4553 
4554 	default:
4555 		return -ENODEV;
4556 	}
4557 }
4558 
4559 static int
4560 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4561 {
4562 	switch (feat) {
4563 	case IOMMU_DEV_FEAT_IOPF:
4564 		return intel_iommu_disable_iopf(dev);
4565 
4566 	case IOMMU_DEV_FEAT_SVA:
4567 		return 0;
4568 
4569 	default:
4570 		return -ENODEV;
4571 	}
4572 }
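
/*
 * Illustrative sketch, not part of this driver: a device driver that wants
 * SVA typically enables the features through the generic API in this order
 * and tears them down in reverse, e.g.
 *
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *	if (ret)
 *		return ret;
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	if (ret)
 *		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 */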
4573 
4574 static bool intel_iommu_is_attach_deferred(struct device *dev)
4575 {
4576 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4577 
4578 	return translation_pre_enabled(info->iommu) && !info->domain;
4579 }
4580 
4581 /*
4582  * Check that the device does not live on an external-facing PCI port that is
4583  * marked as untrusted. Quirks must not be applied to such devices, so that
4584  * they cannot be used to bypass the IOMMU restrictions.
4585  */
4586 static bool risky_device(struct pci_dev *pdev)
4587 {
4588 	if (pdev->untrusted) {
4589 		pci_info(pdev,
4590 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4591 			 pdev->vendor, pdev->device);
4592 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4593 		return true;
4594 	}
4595 	return false;
4596 }
4597 
4598 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4599 				      unsigned long iova, size_t size)
4600 {
4601 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4602 	unsigned long pages = aligned_nrpages(iova, size);
4603 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4604 	struct iommu_domain_info *info;
4605 	unsigned long i;
4606 
4607 	xa_for_each(&dmar_domain->iommu_array, i, info)
4608 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4609 	return 0;
4610 }
4611 
4612 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4613 {
4614 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4615 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4616 	struct intel_iommu *iommu = info->iommu;
4617 	struct dmar_domain *dmar_domain;
4618 	struct iommu_domain *domain;
4619 	unsigned long flags;
4620 
4621 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4622 	if (WARN_ON_ONCE(!domain))
4623 		goto out_tear_down;
4624 
4625 	/*
4626 	 * The SVA implementation needs to handle its own stuff, such as the mm
4627 	 * notification. Before consolidating that code into the iommu core, let
4628 	 * the intel sva code handle it.
4629 	 */
4630 	if (domain->type == IOMMU_DOMAIN_SVA) {
4631 		intel_svm_remove_dev_pasid(dev, pasid);
4632 		goto out_tear_down;
4633 	}
4634 
4635 	dmar_domain = to_dmar_domain(domain);
4636 	spin_lock_irqsave(&dmar_domain->lock, flags);
4637 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4638 		if (curr->dev == dev && curr->pasid == pasid) {
4639 			list_del(&curr->link_domain);
4640 			dev_pasid = curr;
4641 			break;
4642 		}
4643 	}
4644 	WARN_ON_ONCE(!dev_pasid);
4645 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4646 
4647 	domain_detach_iommu(dmar_domain, iommu);
4648 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4649 	kfree(dev_pasid);
4650 out_tear_down:
4651 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4652 	intel_drain_pasid_prq(dev, pasid);
4653 }
4654 
4655 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4656 				     struct device *dev, ioasid_t pasid)
4657 {
4658 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4659 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4660 	struct intel_iommu *iommu = info->iommu;
4661 	struct dev_pasid_info *dev_pasid;
4662 	unsigned long flags;
4663 	int ret;
4664 
4665 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4666 		return -EOPNOTSUPP;
4667 
4668 	if (domain->dirty_ops)
4669 		return -EINVAL;
4670 
4671 	if (context_copied(iommu, info->bus, info->devfn))
4672 		return -EBUSY;
4673 
4674 	ret = prepare_domain_attach_device(domain, dev);
4675 	if (ret)
4676 		return ret;
4677 
4678 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4679 	if (!dev_pasid)
4680 		return -ENOMEM;
4681 
4682 	ret = domain_attach_iommu(dmar_domain, iommu);
4683 	if (ret)
4684 		goto out_free;
4685 
4686 	if (domain_type_is_si(dmar_domain))
4687 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4688 	else if (dmar_domain->use_first_level)
4689 		ret = domain_setup_first_level(iommu, dmar_domain,
4690 					       dev, pasid);
4691 	else
4692 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4693 						     dev, pasid);
4694 	if (ret)
4695 		goto out_detach_iommu;
4696 
4697 	dev_pasid->dev = dev;
4698 	dev_pasid->pasid = pasid;
4699 	spin_lock_irqsave(&dmar_domain->lock, flags);
4700 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4701 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4702 
4703 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4704 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4705 
4706 	return 0;
4707 out_detach_iommu:
4708 	domain_detach_iommu(dmar_domain, iommu);
4709 out_free:
4710 	kfree(dev_pasid);
4711 	return ret;
4712 }
4713 
4714 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4715 {
4716 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4717 	struct intel_iommu *iommu = info->iommu;
4718 	struct iommu_hw_info_vtd *vtd;
4719 
4720 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4721 	if (!vtd)
4722 		return ERR_PTR(-ENOMEM);
4723 
4724 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4725 	vtd->cap_reg = iommu->cap;
4726 	vtd->ecap_reg = iommu->ecap;
4727 	*length = sizeof(*vtd);
4728 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4729 	return vtd;
4730 }
4731 
4732 /*
4733  * Set dirty tracking for the device list of a domain. The caller must
4734  * hold the domain->lock when calling it.
4735  */
4736 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4737 {
4738 	struct device_domain_info *info;
4739 	int ret = 0;
4740 
4741 	list_for_each_entry(info, devices, link) {
4742 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4743 						       IOMMU_NO_PASID, enable);
4744 		if (ret)
4745 			break;
4746 	}
4747 
4748 	return ret;
4749 }
4750 
4751 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4752 					    bool enable)
4753 {
4754 	struct dmar_domain *s1_domain;
4755 	unsigned long flags;
4756 	int ret;
4757 
4758 	spin_lock(&domain->s1_lock);
4759 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4760 		spin_lock_irqsave(&s1_domain->lock, flags);
4761 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4762 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4763 		if (ret)
4764 			goto err_unwind;
4765 	}
4766 	spin_unlock(&domain->s1_lock);
4767 	return 0;
4768 
4769 err_unwind:
4770 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4771 		spin_lock_irqsave(&s1_domain->lock, flags);
4772 		device_set_dirty_tracking(&s1_domain->devices,
4773 					  domain->dirty_tracking);
4774 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4775 	}
4776 	spin_unlock(&domain->s1_lock);
4777 	return ret;
4778 }
4779 
4780 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4781 					  bool enable)
4782 {
4783 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4784 	int ret;
4785 
4786 	spin_lock(&dmar_domain->lock);
4787 	if (dmar_domain->dirty_tracking == enable)
4788 		goto out_unlock;
4789 
4790 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4791 	if (ret)
4792 		goto err_unwind;
4793 
4794 	if (dmar_domain->nested_parent) {
4795 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4796 		if (ret)
4797 			goto err_unwind;
4798 	}
4799 
4800 	dmar_domain->dirty_tracking = enable;
4801 out_unlock:
4802 	spin_unlock(&dmar_domain->lock);
4803 
4804 	return 0;
4805 
4806 err_unwind:
4807 	device_set_dirty_tracking(&dmar_domain->devices,
4808 				  dmar_domain->dirty_tracking);
4809 	spin_unlock(&dmar_domain->lock);
4810 	return ret;
4811 }
4812 
4813 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4814 					    unsigned long iova, size_t size,
4815 					    unsigned long flags,
4816 					    struct iommu_dirty_bitmap *dirty)
4817 {
4818 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4819 	unsigned long end = iova + size - 1;
4820 	unsigned long pgsize;
4821 
4822 	/*
4823 	 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4824 	 * an IOVA bitmap set in order to clean the dirty bits in all PTEs that
4825 	 * might have been set while dirty tracking was stopped. This ensures we
4826 	 * never inherit dirtied bits from a previous cycle.
4827 	 */
4828 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4829 		return -EINVAL;
4830 
4831 	do {
4832 		struct dma_pte *pte;
4833 		int lvl = 0;
4834 
4835 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4836 				     GFP_ATOMIC);
4837 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4838 		if (!pte || !dma_pte_present(pte)) {
4839 			iova += pgsize;
4840 			continue;
4841 		}
4842 
4843 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4844 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4845 		iova += pgsize;
4846 	} while (iova < end);
4847 
4848 	return 0;
4849 }
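
/*
 * Illustrative sketch, not part of this driver: IOMMUFD drives the two
 * callbacks collected in intel_dirty_ops below roughly like this (simplified;
 * "bitmap" stands in for the caller-provided struct iova_bitmap):
 *
 *	struct iommu_iotlb_gather gather;
 *	struct iommu_dirty_bitmap dirty;
 *
 *	domain->dirty_ops->set_dirty_tracking(domain, true);
 *	...
 *	iommu_dirty_bitmap_init(&dirty, bitmap, &gather);
 *	domain->dirty_ops->read_and_clear_dirty(domain, iova, size, 0, &dirty);
 *	iommu_iotlb_sync(domain, &gather);
 */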
4850 
4851 static const struct iommu_dirty_ops intel_dirty_ops = {
4852 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4853 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4854 };
4855 
4856 const struct iommu_ops intel_iommu_ops = {
4857 	.blocked_domain		= &blocking_domain,
4858 	.capable		= intel_iommu_capable,
4859 	.hw_info		= intel_iommu_hw_info,
4860 	.domain_alloc		= intel_iommu_domain_alloc,
4861 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4862 	.probe_device		= intel_iommu_probe_device,
4863 	.probe_finalize		= intel_iommu_probe_finalize,
4864 	.release_device		= intel_iommu_release_device,
4865 	.get_resv_regions	= intel_iommu_get_resv_regions,
4866 	.device_group		= intel_iommu_device_group,
4867 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4868 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4869 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4870 	.def_domain_type	= device_def_domain_type,
4871 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4872 	.pgsize_bitmap		= SZ_4K,
4873 #ifdef CONFIG_INTEL_IOMMU_SVM
4874 	.page_response		= intel_svm_page_response,
4875 #endif
4876 	.default_domain_ops = &(const struct iommu_domain_ops) {
4877 		.attach_dev		= intel_iommu_attach_device,
4878 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4879 		.map_pages		= intel_iommu_map_pages,
4880 		.unmap_pages		= intel_iommu_unmap_pages,
4881 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4882 		.flush_iotlb_all        = intel_flush_iotlb_all,
4883 		.iotlb_sync		= intel_iommu_tlb_sync,
4884 		.iova_to_phys		= intel_iommu_iova_to_phys,
4885 		.free			= intel_iommu_domain_free,
4886 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4887 	}
4888 };
4889 
4890 static void quirk_iommu_igfx(struct pci_dev *dev)
4891 {
4892 	if (risky_device(dev))
4893 		return;
4894 
4895 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4896 	dmar_map_gfx = 0;
4897 }
4898 
4899 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4907 
4908 /* Broadwell igfx malfunctions with dmar */
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4917 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4918 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4919 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4921 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4933 
4934 static void quirk_iommu_rwbf(struct pci_dev *dev)
4935 {
4936 	if (risky_device(dev))
4937 		return;
4938 
4939 	/*
4940 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4941 	 * but needs it. Same seems to hold for the desktop versions.
4942 	 */
4943 	pci_info(dev, "Forcing write-buffer flush capability\n");
4944 	rwbf_quirk = 1;
4945 }
4946 
4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4954 
4955 #define GGC 0x52
4956 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4957 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4958 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4959 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4960 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4961 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4962 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4963 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4964 
4965 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4966 {
4967 	unsigned short ggc;
4968 
4969 	if (risky_device(dev))
4970 		return;
4971 
4972 	if (pci_read_config_word(dev, GGC, &ggc))
4973 		return;
4974 
4975 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4976 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4977 		dmar_map_gfx = 0;
4978 	} else if (dmar_map_gfx) {
4979 		/* we have to ensure the gfx device is idle before we flush */
4980 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4981 		iommu_set_dma_strict();
4982 	}
4983 }
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4988 
4989 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4990 {
4991 	unsigned short ver;
4992 
4993 	if (!IS_GFX_DEVICE(dev))
4994 		return;
4995 
4996 	ver = (dev->device >> 8) & 0xff;
4997 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4998 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4999 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5000 		return;
5001 
5002 	if (risky_device(dev))
5003 		return;
5004 
5005 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5006 	iommu_skip_te_disable = 1;
5007 }
5008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5009 
5010 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5011    ISOCH DMAR unit for the Azalia sound device, but not give it any
5012    TLB entries, which causes it to deadlock. Check for that.  We do
5013    this in a function called from init_dmars(), instead of in a PCI
5014    quirk, because we don't want to print the obnoxious "BIOS broken"
5015    message if VT-d is actually disabled.
5016 */
5017 static void __init check_tylersburg_isoch(void)
5018 {
5019 	struct pci_dev *pdev;
5020 	uint32_t vtisochctrl;
5021 
5022 	/* If there's no Azalia in the system anyway, forget it. */
5023 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5024 	if (!pdev)
5025 		return;
5026 
5027 	if (risky_device(pdev)) {
5028 		pci_dev_put(pdev);
5029 		return;
5030 	}
5031 
5032 	pci_dev_put(pdev);
5033 
5034 	/* System Management Registers. Might be hidden, in which case
5035 	   we can't do the sanity check. But that's OK, because the
5036 	   known-broken BIOSes _don't_ actually hide it, so far. */
5037 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5038 	if (!pdev)
5039 		return;
5040 
5041 	if (risky_device(pdev)) {
5042 		pci_dev_put(pdev);
5043 		return;
5044 	}
5045 
5046 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5047 		pci_dev_put(pdev);
5048 		return;
5049 	}
5050 
5051 	pci_dev_put(pdev);
5052 
5053 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5054 	if (vtisochctrl & 1)
5055 		return;
5056 
5057 	/* Drop all bits other than the number of TLB entries */
5058 	vtisochctrl &= 0x1c;
5059 
5060 	/* If we have the recommended number of TLB entries (16), fine. */
5061 	if (vtisochctrl == 0x10)
5062 		return;
5063 
5064 	/* Zero TLB entries? You get to ride the short bus to school. */
5065 	if (!vtisochctrl) {
5066 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5067 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5068 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5069 		     dmi_get_system_info(DMI_BIOS_VERSION),
5070 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5071 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5072 		return;
5073 	}
5074 
5075 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5076 	       vtisochctrl);
5077 }
5078 
5079 /*
5080  * Here we deal with a device TLB defect where the device may inadvertently
5081  * issue an ATS invalidation completion before posted writes initiated with
5082  * translated addresses that used translations matching the invalidation
5083  * address range, violating the invalidation completion ordering.
5084  * Therefore, any use case that cannot guarantee DMA is stopped before unmap
5085  * is vulnerable to this defect. In other words, any dTLB invalidation not
5086  * initiated under the control of the trusted/privileged host device driver
5087  * must use this quirk.
5088  * Device TLBs are invalidated under the following six conditions:
5089  * 1. Device driver does a DMA API unmap of an IOVA
5090  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5091  * 3. PASID is torn down, after the PASID cache is flushed, e.g. process
5092  *    exit_mmap() due to a crash
5093  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5094  *    the VM has to free pages that were unmapped
5095  * 5. Userspace driver unmaps a DMA buffer
5096  * 6. Cache invalidation in vSVA usage (upcoming)
5097  *
5098  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5099  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback
5100  * to invalidate the TLB the same way as a normal user unmap, which will use
5101  * this quirk. The dTLB invalidation after a PASID cache flush does not need
5102  * this quirk.
5103  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5104  */
5105 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5106 			       unsigned long address, unsigned long mask,
5107 			       u32 pasid, u16 qdep)
5108 {
5109 	u16 sid;
5110 
5111 	if (likely(!info->dtlb_extra_inval))
5112 		return;
5113 
5114 	sid = PCI_DEVID(info->bus, info->devfn);
5115 	if (pasid == IOMMU_NO_PASID) {
5116 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5117 				   qdep, address, mask);
5118 	} else {
5119 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5120 					 pasid, qdep, address, mask);
5121 	}
5122 }
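
/*
 * Illustrative note, not from the original source: "mask" is the invalidation
 * size expressed as a power-of-two number of 4KiB pages and "address" is
 * expected to be aligned accordingly. For example, a hypothetical caller
 * invalidating one 2MiB range at IOVA 0x200000 would pass address == 0x200000
 * and mask == 9 (1 << 9 pages of 4KiB == 2MiB), and the qi_flush_dev_iotlb*()
 * helpers fold that size hint into the ATS invalidation descriptor.
 */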
5123 
5124 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5125 
5126 /*
5127  * Function to submit a command to the enhanced command interface. The
5128  * valid enhanced command descriptions are defined in Table 47 of the
5129  * VT-d spec. The VT-d hardware implementation may support some but not
5130  * all commands, which can be determined by checking the Enhanced
5131  * Command Capability Register.
5132  *
5133  * Return values:
5134  *  - 0: Command successful without any error;
5135  *  - Negative: software error value;
5136  *  - Nonzero positive: failure status code defined in Table 48.
5137  */
5138 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5139 {
5140 	unsigned long flags;
5141 	u64 res;
5142 	int ret;
5143 
5144 	if (!cap_ecmds(iommu->cap))
5145 		return -ENODEV;
5146 
5147 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5148 
5149 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5150 	if (res & DMA_ECMD_ECRSP_IP) {
5151 		ret = -EBUSY;
5152 		goto err;
5153 	}
5154 
5155 	/*
5156 	 * Unconditionally write operand B, because:
5157 	 * - There is no side effect if an ecmd doesn't require an
5158 	 *   operand B, but we set the register to some value anyway.
5159 	 * - It's not invoked in any critical path, so the extra MMIO
5160 	 *   write doesn't raise any performance concerns.
5161 	 */
5162 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5163 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5164 
5165 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5166 		      !(res & DMA_ECMD_ECRSP_IP), res);
5167 
5168 	if (res & DMA_ECMD_ECRSP_IP) {
5169 		ret = -ETIMEDOUT;
5170 		goto err;
5171 	}
5172 
5173 	ret = ecmd_get_status_code(res);
5174 err:
5175 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5176 
5177 	return ret;
5178 }
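
/*
 * Illustrative sketch, not part of this driver: a hypothetical caller must
 * treat the two failure spaces differently (assuming a DMA_ECMD_FREEZE-style
 * opcode constant), e.g.
 *
 *	ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *	if (ret < 0)
 *		return ret;	(software error such as -ENODEV or -EBUSY)
 *	if (ret)
 *		return -EIO;	(non-zero hardware status code, see Table 48)
 */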
5179