xref: /linux/drivers/iommu/intel/iommu.c (revision 4cde72fead4cebb5b6b2fe9425904c2064739184)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/* A root table and a context table each occupy exactly one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
/*
 * PCI device classification helpers. Each takes a pointer to a struct
 * pci_dev. The argument is parenthesized in every macro so that an
 * arbitrary pointer expression (e.g. "p + 1") expands correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
/* Bounds of the IOAPIC address window, and the lowest IOVA byte address. */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Address width, in bits, requested by iommu_calculate_agaw(). */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Cap on an address width, and the same cap expressed in PFN bits. */
#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/* Largest PFN and largest byte address expressible in @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 *
 * NOTE(review): DOMAIN_MAX_ADDR is derived from the unclamped
 * __DOMAIN_MAX_PFN, so the two only agree where unsigned long is 64 bits
 * wide -- confirm this is intended.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

/* Convert a byte address to a page frame number in units of PAGE_SIZE. */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/*
 * Page table handling: each level indexes LEVEL_STRIDE bits of the PFN,
 * and LEVEL_MASK selects those bits once shifted down.
 */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69 
/*
 * Translate an AGAW encoding into a page-table depth: encoding 0 yields
 * 2 levels and every increment adds one more level.
 */
static inline int agaw_to_level(int agaw)
{
	const int base_levels = 2;

	return base_levels + agaw;
}
74 
75 static inline int agaw_to_width(int agaw)
76 {
77 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79 
80 static inline int width_to_agaw(int width)
81 {
82 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84 
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 	return (level - 1) * LEVEL_STRIDE;
88 }
89 
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94 
95 static inline u64 level_mask(int level)
96 {
97 	return -1ULL << level_to_offset_bits(level);
98 }
99 
100 static inline u64 level_size(int level)
101 {
102 	return 1ULL << level_to_offset_bits(level);
103 }
104 
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 	return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109 
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114 
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123 	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
/* Return the first VT-d PFN backing struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn_start(mm_pfn);
}
/* Return the first VT-d PFN of the page containing kernel address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
133 
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of intel_iommu=. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" option of intel_iommu=. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
171 static inline void context_set_present(struct context_entry *context)
172 {
173 	context->lo |= 1;
174 }
175 
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178 	context->lo &= (((u64)-1) << 2) | 1;
179 }
180 
181 static inline void context_set_translation_type(struct context_entry *context,
182 						unsigned long value)
183 {
184 	context->lo &= (((u64)-1) << 4) | 3;
185 	context->lo |= (value & 3) << 2;
186 }
187 
188 static inline void context_set_address_root(struct context_entry *context,
189 					    unsigned long value)
190 {
191 	context->lo &= ~VTD_PAGE_MASK;
192 	context->lo |= value & VTD_PAGE_MASK;
193 }
194 
195 static inline void context_set_address_width(struct context_entry *context,
196 					     unsigned long value)
197 {
198 	context->hi |= value & 7;
199 }
200 
201 static inline void context_set_domain_id(struct context_entry *context,
202 					 unsigned long value)
203 {
204 	context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206 
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209 	context->lo |= CONTEXT_PASIDE;
210 }
211 
212 static inline int context_domain_id(struct context_entry *c)
213 {
214 	return((c->hi >> 8) & 0xffff);
215 }
216 
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219 	context->lo = 0;
220 	context->hi = 0;
221 }
222 
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 	if (!iommu->copied_tables)
226 		return false;
227 
228 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230 
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236 
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242 
/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
251 
/*
 * One RMRR unit. Units are linked through @list on dmar_rmrr_units
 * (see for_each_rmrr_units()).
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
260 
/* One ATSR unit; presumably linked on dmar_atsr_units -- verify at users. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
268 
/* One SATC unit; presumably linked on dmar_satc_units -- verify at users. */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
277 
/* List heads for the ATSR, RMRR and SATC unit structures. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every struct dmar_rmrr_unit on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284 
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Defaults come from Kconfig; intel_iommu=on/off and intel_iommu=sm_on/sm_off
 * override them at boot.
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by intel_iommu=igfx_off. */
static int dmar_map_gfx = 1;
/* Cleared by intel_iommu=sp_off; when zero no superpage level is reported. */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

/* Flag bits, presumably for iommu_identity_mapping -- TODO confirm. */
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
303 
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308 
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313 
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316 	u32 gsts;
317 
318 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 	if (gsts & DMA_GSTS_TES)
320 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322 
323 static int __init intel_iommu_setup(char *str)
324 {
325 	if (!str)
326 		return -EINVAL;
327 
328 	while (*str) {
329 		if (!strncmp(str, "on", 2)) {
330 			dmar_disabled = 0;
331 			pr_info("IOMMU enabled\n");
332 		} else if (!strncmp(str, "off", 3)) {
333 			dmar_disabled = 1;
334 			no_platform_optin = 1;
335 			pr_info("IOMMU disabled\n");
336 		} else if (!strncmp(str, "igfx_off", 8)) {
337 			dmar_map_gfx = 0;
338 			pr_info("Disable GFX device mapping\n");
339 		} else if (!strncmp(str, "forcedac", 8)) {
340 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 			iommu_dma_forcedac = true;
342 		} else if (!strncmp(str, "strict", 6)) {
343 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 			iommu_set_dma_strict();
345 		} else if (!strncmp(str, "sp_off", 6)) {
346 			pr_info("Disable supported super page\n");
347 			intel_iommu_superpage = 0;
348 		} else if (!strncmp(str, "sm_on", 5)) {
349 			pr_info("Enable scalable mode if hardware supports\n");
350 			intel_iommu_sm = 1;
351 		} else if (!strncmp(str, "sm_off", 6)) {
352 			pr_info("Scalable mode is disallowed\n");
353 			intel_iommu_sm = 0;
354 		} else if (!strncmp(str, "tboot_noforce", 13)) {
355 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 			intel_iommu_tboot_noforce = 1;
357 		} else {
358 			pr_notice("Unknown option - '%s'\n", str);
359 		}
360 
361 		str += strcspn(str, ",");
362 		while (*str == ',')
363 			str++;
364 	}
365 
366 	return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
369 
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 	struct page *page;
373 	void *vaddr = NULL;
374 
375 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 	if (page)
377 		vaddr = page_address(page);
378 	return vaddr;
379 }
380 
/* Release a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
385 
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390 
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 				       unsigned long pfn)
393 {
394 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395 
396 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398 
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406 	unsigned long fl_sagaw, sl_sagaw;
407 
408 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 	sl_sagaw = cap_sagaw(iommu->cap);
410 
411 	/* Second level only. */
412 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 		return sl_sagaw;
414 
415 	/* First level only. */
416 	if (!ecap_slts(iommu->ecap))
417 		return fl_sagaw;
418 
419 	return fl_sagaw & sl_sagaw;
420 }
421 
/*
 * Find the largest supported AGAW encoding whose width does not exceed the
 * encoding derived from @max_gaw.
 *
 * Return: the AGAW encoding, or -1 when no candidate bit is set in SAGAW.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
435 
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443 
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453 
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 	return sm_supported(iommu) ?
457 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459 
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462 	struct iommu_domain_info *info;
463 	struct dmar_drhd_unit *drhd;
464 	struct intel_iommu *iommu;
465 	bool found = false;
466 	unsigned long i;
467 
468 	domain->iommu_coherency = true;
469 	xa_for_each(&domain->iommu_array, i, info) {
470 		found = true;
471 		if (!iommu_paging_structure_coherency(info->iommu)) {
472 			domain->iommu_coherency = false;
473 			break;
474 		}
475 	}
476 	if (found)
477 		return;
478 
479 	/* No hardware attached; use lowest common denominator */
480 	rcu_read_lock();
481 	for_each_active_iommu(iommu, drhd) {
482 		if (!iommu_paging_structure_coherency(iommu)) {
483 			domain->iommu_coherency = false;
484 			break;
485 		}
486 	}
487 	rcu_read_unlock();
488 }
489 
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 					 struct intel_iommu *skip)
492 {
493 	struct dmar_drhd_unit *drhd;
494 	struct intel_iommu *iommu;
495 	int mask = 0x3;
496 
497 	if (!intel_iommu_superpage)
498 		return 0;
499 
500 	/* set iommu_superpage to the smallest common denominator */
501 	rcu_read_lock();
502 	for_each_active_iommu(iommu, drhd) {
503 		if (iommu != skip) {
504 			if (domain && domain->use_first_level) {
505 				if (!cap_fl1gp_support(iommu->cap))
506 					mask = 0x1;
507 			} else {
508 				mask &= cap_super_page_val(iommu->cap);
509 			}
510 
511 			if (!mask)
512 				break;
513 		}
514 	}
515 	rcu_read_unlock();
516 
517 	return fls(mask);
518 }
519 
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522 	struct device_domain_info *info;
523 	int nid = NUMA_NO_NODE;
524 	unsigned long flags;
525 
526 	spin_lock_irqsave(&domain->lock, flags);
527 	list_for_each_entry(info, &domain->devices, link) {
528 		/*
529 		 * There could possibly be multiple device numa nodes as devices
530 		 * within the same domain may sit behind different IOMMUs. There
531 		 * isn't perfect answer in such situation, so we select first
532 		 * come first served policy.
533 		 */
534 		nid = dev_to_node(info->dev);
535 		if (nid != NUMA_NO_NODE)
536 			break;
537 	}
538 	spin_unlock_irqrestore(&domain->lock, flags);
539 
540 	return nid;
541 }
542 
543 static void domain_update_iotlb(struct dmar_domain *domain);
544 
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 	unsigned long bitmap = 0;
549 
550 	/*
551 	 * 1-level super page supports page size of 2MiB, 2-level super page
552 	 * supports page size of both 2MiB and 1GiB.
553 	 */
554 	if (domain->iommu_superpage == 1)
555 		bitmap |= SZ_2M;
556 	else if (domain->iommu_superpage == 2)
557 		bitmap |= SZ_2M | SZ_1G;
558 
559 	return bitmap;
560 }
561 
562 /* Some capabilities may be different across iommus */
563 void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565 	domain_update_iommu_coherency(domain);
566 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567 
568 	/*
569 	 * If RHSA is missing, we should default to the device numa domain
570 	 * as fall back.
571 	 */
572 	if (domain->nid == NUMA_NO_NODE)
573 		domain->nid = domain_update_device_node(domain);
574 
575 	/*
576 	 * First-level translation restricts the input-address to a
577 	 * canonical address (i.e., address bits 63:N have the same
578 	 * value as address bit [N-1], where N is 48-bits with 4-level
579 	 * paging and 57-bits with 5-level paging). Hence, skip bit
580 	 * [N-1].
581 	 */
582 	if (domain->use_first_level)
583 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 	else
585 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586 
587 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 	domain_update_iotlb(domain);
589 }
590 
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 					 u8 devfn, int alloc)
593 {
594 	struct root_entry *root = &iommu->root_entry[bus];
595 	struct context_entry *context;
596 	u64 *entry;
597 
598 	/*
599 	 * Except that the caller requested to allocate a new entry,
600 	 * returning a copied context entry makes no sense.
601 	 */
602 	if (!alloc && context_copied(iommu, bus, devfn))
603 		return NULL;
604 
605 	entry = &root->lo;
606 	if (sm_supported(iommu)) {
607 		if (devfn >= 0x80) {
608 			devfn -= 0x80;
609 			entry = &root->hi;
610 		}
611 		devfn *= 2;
612 	}
613 	if (*entry & 1)
614 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 	else {
616 		unsigned long phy_addr;
617 		if (!alloc)
618 			return NULL;
619 
620 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621 		if (!context)
622 			return NULL;
623 
624 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 		phy_addr = virt_to_phys((void *)context);
626 		*entry = phy_addr | 1;
627 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
628 	}
629 	return &context[devfn];
630 }
631 
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *				 sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 	struct pci_dev *pdev, *pbridge;
644 
645 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 		return false;
647 
648 	pdev = to_pci_dev(dev);
649 	pbridge = to_pci_dev(bridge);
650 
651 	if (pbridge->subordinate &&
652 	    pbridge->subordinate->number <= pdev->bus->number &&
653 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 		return true;
655 
656 	return false;
657 }
658 
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661 	struct dmar_drhd_unit *drhd;
662 	u32 vtbar;
663 	int rc;
664 
665 	/* We know that this device on this chipset has its own IOMMU.
666 	 * If we find it under a different IOMMU, then the BIOS is lying
667 	 * to us. Hope that the IOMMU for this device is actually
668 	 * disabled, and it needs no translation...
669 	 */
670 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 	if (rc) {
672 		/* "can't" happen */
673 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 		return false;
675 	}
676 	vtbar &= 0xffff0000;
677 
678 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
679 	drhd = dmar_find_matched_drhd_unit(pdev);
680 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 		return true;
684 	}
685 
686 	return false;
687 }
688 
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 	if (!iommu || iommu->drhd->ignored)
692 		return true;
693 
694 	if (dev_is_pci(dev)) {
695 		struct pci_dev *pdev = to_pci_dev(dev);
696 
697 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 		    quirk_ioat_snb_local_iommu(pdev))
700 			return true;
701 	}
702 
703 	return false;
704 }
705 
/*
 * device_to_iommu - find the IOMMU that serves a device
 * @dev: device to look up; NULL yields NULL
 * @bus: if non-NULL (together with @devfn), receives the bus number to use
 * @devfn: if non-NULL (together with @bus), receives the devfn to use
 *
 * The DRHD device scopes are searched under the RCU read lock. A PCI
 * device is matched through its real DMA device and, for a VF, through its
 * PF; a non-PCI device with an ACPI companion is matched through that
 * companion. A PCI device also matches when it sits below a bridge listed
 * in a scope, or when the DRHD has include_all set.
 *
 * @bus/@devfn are taken from the scope table on a direct match of a
 * non-VF device, and from the PCI device itself in all other cases.
 *
 * Return: the IOMMU, or NULL if none matches or iommu_is_dummy() rejects
 * the match.
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		/* From here on @dev is the PF; @pdev keeps the original BDF. */
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* A PCI device can only match a DRHD of its own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
			/* Also entered by goto from the scope loop above. */
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion: nothing matched. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
775 
776 static void domain_flush_cache(struct dmar_domain *domain,
777 			       void *addr, int size)
778 {
779 	if (!domain->iommu_coherency)
780 		clflush_cache_range(addr, size);
781 }
782 
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785 	struct context_entry *context;
786 	int i;
787 
788 	if (!iommu->root_entry)
789 		return;
790 
791 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 		context = iommu_context_addr(iommu, i, 0, 0);
793 		if (context)
794 			free_pgtable_page(context);
795 
796 		if (!sm_supported(iommu))
797 			continue;
798 
799 		context = iommu_context_addr(iommu, i, 0x80, 0);
800 		if (context)
801 			free_pgtable_page(context);
802 	}
803 
804 	free_pgtable_page(iommu->root_entry);
805 	iommu->root_entry = NULL;
806 }
807 
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812 	struct dma_pte *pte;
813 	int offset;
814 
815 	while (1) {
816 		offset = pfn_level_offset(pfn, level);
817 		pte = &parent[offset];
818 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819 			pr_info("PTE not present at level %d\n", level);
820 			break;
821 		}
822 
823 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824 
825 		if (level == 1)
826 			break;
827 
828 		parent = phys_to_virt(dma_pte_addr(pte));
829 		level--;
830 	}
831 }
832 
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 			  unsigned long long addr, u32 pasid)
835 {
836 	struct pasid_dir_entry *dir, *pde;
837 	struct pasid_entry *entries, *pte;
838 	struct context_entry *ctx_entry;
839 	struct root_entry *rt_entry;
840 	int i, dir_index, index, level;
841 	u8 devfn = source_id & 0xff;
842 	u8 bus = source_id >> 8;
843 	struct dma_pte *pgtable;
844 
845 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846 
847 	/* root entry dump */
848 	rt_entry = &iommu->root_entry[bus];
849 	if (!rt_entry) {
850 		pr_info("root table entry is not present\n");
851 		return;
852 	}
853 
854 	if (sm_supported(iommu))
855 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 			rt_entry->hi, rt_entry->lo);
857 	else
858 		pr_info("root entry: 0x%016llx", rt_entry->lo);
859 
860 	/* context entry dump */
861 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862 	if (!ctx_entry) {
863 		pr_info("context table entry is not present\n");
864 		return;
865 	}
866 
867 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 		ctx_entry->hi, ctx_entry->lo);
869 
870 	/* legacy mode does not require PASID entries */
871 	if (!sm_supported(iommu)) {
872 		level = agaw_to_level(ctx_entry->hi & 7);
873 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 		goto pgtable_walk;
875 	}
876 
877 	/* get the pointer to pasid directory entry */
878 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 	if (!dir) {
880 		pr_info("pasid directory entry is not present\n");
881 		return;
882 	}
883 	/* For request-without-pasid, get the pasid from context entry */
884 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885 		pasid = IOMMU_NO_PASID;
886 
887 	dir_index = pasid >> PASID_PDE_SHIFT;
888 	pde = &dir[dir_index];
889 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890 
891 	/* get the pointer to the pasid table entry */
892 	entries = get_pasid_table_from_pde(pde);
893 	if (!entries) {
894 		pr_info("pasid table entry is not present\n");
895 		return;
896 	}
897 	index = pasid & PASID_PTE_MASK;
898 	pte = &entries[index];
899 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901 
902 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905 	} else {
906 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908 	}
909 
910 pgtable_walk:
911 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914 
/*
 * pfn_to_dma_pte - find (and optionally build the path to) the PTE for @pfn
 * @domain: domain whose page table is walked
 * @pfn: VT-d page frame number to look up
 * @target_level: in/out. If *@target_level is 0 the walk stops at the first
 *		  superpage or non-present entry and the level reached is
 *		  written back. Otherwise the walk descends to that level,
 *		  allocating any missing intermediate tables.
 * @gfp: allocation flags for new page-table pages
 *
 * Return: the PTE at the level where the walk stopped, or NULL if @pfn is
 * beyond the domain's address width or a table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop at a leaf or at a hole. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			/* Flush the zeroed table before it can be linked in. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install only if the slot is still empty. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		/* Re-read the PTE: it may hold another thread's table. */
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
970 
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973 					 unsigned long pfn,
974 					 int level, int *large_page)
975 {
976 	struct dma_pte *parent, *pte;
977 	int total = agaw_to_level(domain->agaw);
978 	int offset;
979 
980 	parent = domain->pgd;
981 	while (level <= total) {
982 		offset = pfn_level_offset(pfn, total);
983 		pte = &parent[offset];
984 		if (level == total)
985 			return pte;
986 
987 		if (!dma_pte_present(pte)) {
988 			*large_page = total;
989 			break;
990 		}
991 
992 		if (dma_pte_superpage(pte)) {
993 			*large_page = total;
994 			return pte;
995 		}
996 
997 		parent = phys_to_virt(dma_pte_addr(pte));
998 		total--;
999 	}
1000 	return NULL;
1001 }
1002 
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005 				unsigned long start_pfn,
1006 				unsigned long last_pfn)
1007 {
1008 	unsigned int large_page;
1009 	struct dma_pte *first_pte, *pte;
1010 
1011 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012 	    WARN_ON(start_pfn > last_pfn))
1013 		return;
1014 
1015 	/* we don't need lock here; nobody else touches the iova range */
1016 	do {
1017 		large_page = 1;
1018 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019 		if (!pte) {
1020 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021 			continue;
1022 		}
1023 		do {
1024 			dma_clear_pte(pte);
1025 			start_pfn += lvl_to_nr_pages(large_page);
1026 			pte++;
1027 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028 
1029 		domain_flush_cache(domain, first_pte,
1030 				   (void *)pte - (void *)first_pte);
1031 
1032 	} while (start_pfn && start_pfn <= last_pfn);
1033 }
1034 
/*
 * dma_pte_free_level - free lower-level page tables inside a PFN range
 * @domain: domain owning the page table
 * @level: level of the table that @pte points to
 * @retain_level: tables referenced from this level or above are kept
 * @pte: first entry of the table at @level
 * @pfn: first PFN covered by that table
 * @start_pfn: first PFN of the range being torn down
 * @last_pfn: last PFN of the range being torn down
 *
 * Recurses down to level 2. A child table is freed only when @level is
 * below @retain_level and the range covers everything the child maps.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at the entry covering the first PFN of interest. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		/* Only present non-leaf entries reference a child table. */
		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1073 
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079 				   unsigned long start_pfn,
1080 				   unsigned long last_pfn,
1081 				   int retain_level)
1082 {
1083 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 
1085 	/* We don't need lock here; nobody else touches the iova range */
1086 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 			   domain->pgd, 0, start_pfn, last_pfn);
1088 
1089 	/* free pgd */
1090 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 		free_pgtable_page(domain->pgd);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103 				    int level, struct dma_pte *pte,
1104 				    struct list_head *freelist)
1105 {
1106 	struct page *pg;
1107 
1108 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109 	list_add_tail(&pg->lru, freelist);
1110 
1111 	if (level == 1)
1112 		return;
1113 
1114 	pte = page_address(pg);
1115 	do {
1116 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118 		pte++;
1119 	} while (!first_pte_in_page(pte));
1120 }
1121 
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123 				struct dma_pte *pte, unsigned long pfn,
1124 				unsigned long start_pfn, unsigned long last_pfn,
1125 				struct list_head *freelist)
1126 {
1127 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128 
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn = pfn & level_mask(level);
1134 
1135 		if (!dma_pte_present(pte))
1136 			goto next;
1137 
1138 		/* If range covers entire pagetable, free it */
1139 		if (start_pfn <= level_pfn &&
1140 		    last_pfn >= level_pfn + level_size(level) - 1) {
1141 			/* These suborbinate page tables are going away entirely. Don't
1142 			   bother to clear them; we're just going to *free* them. */
1143 			if (level > 1 && !dma_pte_superpage(pte))
1144 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145 
1146 			dma_clear_pte(pte);
1147 			if (!first_pte)
1148 				first_pte = pte;
1149 			last_pte = pte;
1150 		} else if (level > 1) {
1151 			/* Recurse down into a level that isn't *entirely* obsolete */
1152 			dma_pte_clear_level(domain, level - 1,
1153 					    phys_to_virt(dma_pte_addr(pte)),
1154 					    level_pfn, start_pfn, last_pfn,
1155 					    freelist);
1156 		}
1157 next:
1158 		pfn = level_pfn + level_size(level);
1159 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160 
1161 	if (first_pte)
1162 		domain_flush_cache(domain, first_pte,
1163 				   (void *)++last_pte - (void *)first_pte);
1164 }
1165 
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170 			 unsigned long last_pfn, struct list_head *freelist)
1171 {
1172 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173 	    WARN_ON(start_pfn > last_pfn))
1174 		return;
1175 
1176 	/* we don't need lock here; nobody else touches the iova range */
1177 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1179 
1180 	/* free pgd */
1181 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 		struct page *pgd_page = virt_to_page(domain->pgd);
1183 		list_add_tail(&pgd_page->lru, freelist);
1184 		domain->pgd = NULL;
1185 	}
1186 }
1187 
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191 	struct root_entry *root;
1192 
1193 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194 	if (!root) {
1195 		pr_err("Allocating root entry for %s failed\n",
1196 			iommu->name);
1197 		return -ENOMEM;
1198 	}
1199 
1200 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1201 	iommu->root_entry = root;
1202 
1203 	return 0;
1204 }
1205 
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208 	u64 addr;
1209 	u32 sts;
1210 	unsigned long flag;
1211 
1212 	addr = virt_to_phys(iommu->root_entry);
1213 	if (sm_supported(iommu))
1214 		addr |= DMA_RTADDR_SMT;
1215 
1216 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218 
1219 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220 
1221 	/* Make sure hardware complete it */
1222 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223 		      readl, (sts & DMA_GSTS_RTPS), sts);
1224 
1225 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226 
1227 	/*
1228 	 * Hardware invalidates all DMA remapping hardware translation
1229 	 * caches as part of SRTP flow.
1230 	 */
1231 	if (cap_esrtps(iommu->cap))
1232 		return;
1233 
1234 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235 	if (sm_supported(iommu))
1236 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239 
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242 	u32 val;
1243 	unsigned long flag;
1244 
1245 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246 		return;
1247 
1248 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250 
1251 	/* Make sure hardware complete it */
1252 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1254 
1255 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257 
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260 				  u16 did, u16 source_id, u8 function_mask,
1261 				  u64 type)
1262 {
1263 	u64 val = 0;
1264 	unsigned long flag;
1265 
1266 	switch (type) {
1267 	case DMA_CCMD_GLOBAL_INVL:
1268 		val = DMA_CCMD_GLOBAL_INVL;
1269 		break;
1270 	case DMA_CCMD_DOMAIN_INVL:
1271 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272 		break;
1273 	case DMA_CCMD_DEVICE_INVL:
1274 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276 		break;
1277 	default:
1278 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279 			iommu->name, type);
1280 		return;
1281 	}
1282 	val |= DMA_CCMD_ICC;
1283 
1284 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286 
1287 	/* Make sure hardware complete it */
1288 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290 
1291 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293 
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 				u64 addr, unsigned int size_order, u64 type)
1297 {
1298 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 	u64 val = 0, val_iva = 0;
1300 	unsigned long flag;
1301 
1302 	switch (type) {
1303 	case DMA_TLB_GLOBAL_FLUSH:
1304 		/* global flush doesn't need set IVA_REG */
1305 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306 		break;
1307 	case DMA_TLB_DSI_FLUSH:
1308 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 		break;
1310 	case DMA_TLB_PSI_FLUSH:
1311 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 		/* IH bit is passed in as part of address */
1313 		val_iva = size_order | addr;
1314 		break;
1315 	default:
1316 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317 			iommu->name, type);
1318 		return;
1319 	}
1320 
1321 	if (cap_write_drain(iommu->cap))
1322 		val |= DMA_TLB_WRITE_DRAIN;
1323 
1324 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 	/* Note: Only uses first TLB reg currently */
1326 	if (val_iva)
1327 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329 
1330 	/* Make sure hardware complete it */
1331 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333 
1334 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335 
1336 	/* check IOTLB invalidation granularity */
1337 	if (DMA_TLB_IAIG(val) == 0)
1338 		pr_err("Flush IOTLB failed\n");
1339 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 			(unsigned long long)DMA_TLB_IIRG(type),
1342 			(unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344 
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349 	struct device_domain_info *info;
1350 	unsigned long flags;
1351 
1352 	spin_lock_irqsave(&domain->lock, flags);
1353 	list_for_each_entry(info, &domain->devices, link) {
1354 		if (info->iommu == iommu && info->bus == bus &&
1355 		    info->devfn == devfn) {
1356 			spin_unlock_irqrestore(&domain->lock, flags);
1357 			return info;
1358 		}
1359 	}
1360 	spin_unlock_irqrestore(&domain->lock, flags);
1361 
1362 	return NULL;
1363 }
1364 
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 	struct dev_pasid_info *dev_pasid;
1368 	struct device_domain_info *info;
1369 	bool has_iotlb_device = false;
1370 	unsigned long flags;
1371 
1372 	spin_lock_irqsave(&domain->lock, flags);
1373 	list_for_each_entry(info, &domain->devices, link) {
1374 		if (info->ats_enabled) {
1375 			has_iotlb_device = true;
1376 			break;
1377 		}
1378 	}
1379 
1380 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 		info = dev_iommu_priv_get(dev_pasid->dev);
1382 		if (info->ats_enabled) {
1383 			has_iotlb_device = true;
1384 			break;
1385 		}
1386 	}
1387 	domain->has_iotlb_device = has_iotlb_device;
1388 	spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390 
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401 		return false;
1402 
1403 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404 		return false;
1405 
1406 	return true;
1407 }
1408 
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411 	struct pci_dev *pdev;
1412 
1413 	if (!dev_is_pci(info->dev))
1414 		return;
1415 
1416 	pdev = to_pci_dev(info->dev);
1417 
1418 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1419 	   the device if you enable PASID support after ATS support is
1420 	   undefined. So always enable PASID support on devices which
1421 	   have it, even if we can't yet know if we're ever going to
1422 	   use it. */
1423 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 		info->pasid_enabled = 1;
1425 
1426 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428 		info->ats_enabled = 1;
1429 		domain_update_iotlb(info->domain);
1430 	}
1431 }
1432 
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435 	struct pci_dev *pdev;
1436 
1437 	if (!dev_is_pci(info->dev))
1438 		return;
1439 
1440 	pdev = to_pci_dev(info->dev);
1441 
1442 	if (info->ats_enabled) {
1443 		pci_disable_ats(pdev);
1444 		info->ats_enabled = 0;
1445 		domain_update_iotlb(info->domain);
1446 	}
1447 
1448 	if (info->pasid_enabled) {
1449 		pci_disable_pasid(pdev);
1450 		info->pasid_enabled = 0;
1451 	}
1452 }
1453 
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455 				    u64 addr, unsigned int mask)
1456 {
1457 	u16 sid, qdep;
1458 
1459 	if (!info || !info->ats_enabled)
1460 		return;
1461 
1462 	sid = info->bus << 8 | info->devfn;
1463 	qdep = info->ats_qdep;
1464 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 			   qdep, addr, mask);
1466 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468 
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 				  u64 addr, unsigned mask)
1471 {
1472 	struct dev_pasid_info *dev_pasid;
1473 	struct device_domain_info *info;
1474 	unsigned long flags;
1475 
1476 	if (!domain->has_iotlb_device)
1477 		return;
1478 
1479 	spin_lock_irqsave(&domain->lock, flags);
1480 	list_for_each_entry(info, &domain->devices, link)
1481 		__iommu_flush_dev_iotlb(info, addr, mask);
1482 
1483 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484 		info = dev_iommu_priv_get(dev_pasid->dev);
1485 
1486 		if (!info->ats_enabled)
1487 			continue;
1488 
1489 		qi_flush_dev_iotlb_pasid(info->iommu,
1490 					 PCI_DEVID(info->bus, info->devfn),
1491 					 info->pfsid, dev_pasid->pasid,
1492 					 info->ats_qdep, addr,
1493 					 mask);
1494 	}
1495 	spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497 
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499 				     struct dmar_domain *domain, u64 addr,
1500 				     unsigned long npages, bool ih)
1501 {
1502 	u16 did = domain_id_iommu(domain, iommu);
1503 	struct dev_pasid_info *dev_pasid;
1504 	unsigned long flags;
1505 
1506 	spin_lock_irqsave(&domain->lock, flags);
1507 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509 
1510 	if (!list_empty(&domain->devices))
1511 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512 	spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514 
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516 				  struct dmar_domain *domain,
1517 				  unsigned long pfn, unsigned int pages,
1518 				  int ih, int map)
1519 {
1520 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521 	unsigned int mask = ilog2(aligned_pages);
1522 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523 	u16 did = domain_id_iommu(domain, iommu);
1524 
1525 	if (WARN_ON(!pages))
1526 		return;
1527 
1528 	if (ih)
1529 		ih = 1 << 6;
1530 
1531 	if (domain->use_first_level) {
1532 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533 	} else {
1534 		unsigned long bitmask = aligned_pages - 1;
1535 
1536 		/*
1537 		 * PSI masks the low order bits of the base address. If the
1538 		 * address isn't aligned to the mask, then compute a mask value
1539 		 * needed to ensure the target range is flushed.
1540 		 */
1541 		if (unlikely(bitmask & pfn)) {
1542 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543 
1544 			/*
1545 			 * Since end_pfn <= pfn + bitmask, the only way bits
1546 			 * higher than bitmask can differ in pfn and end_pfn is
1547 			 * by carrying. This means after masking out bitmask,
1548 			 * high bits starting with the first set bit in
1549 			 * shared_bits are all equal in both pfn and end_pfn.
1550 			 */
1551 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553 		}
1554 
1555 		/*
1556 		 * Fallback to domain selective flush if no PSI support or
1557 		 * the size is too big.
1558 		 */
1559 		if (!cap_pgsel_inv(iommu->cap) ||
1560 		    mask > cap_max_amask_val(iommu->cap))
1561 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 							DMA_TLB_DSI_FLUSH);
1563 		else
1564 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 							DMA_TLB_PSI_FLUSH);
1566 	}
1567 
1568 	/*
1569 	 * In caching mode, changes of pages from non-present to present require
1570 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1571 	 */
1572 	if (!cap_caching_mode(iommu->cap) || !map)
1573 		iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575 
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 					struct dmar_domain *domain,
1579 					unsigned long pfn, unsigned int pages)
1580 {
1581 	/*
1582 	 * It's a non-present to present mapping. Only flush if caching mode
1583 	 * and second level.
1584 	 */
1585 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 	else
1588 		iommu_flush_write_buffer(iommu);
1589 }
1590 
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594 	struct iommu_domain_info *info;
1595 	unsigned long idx;
1596 
1597 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598 		struct intel_iommu *iommu = info->iommu;
1599 		u16 did = domain_id_iommu(dmar_domain, iommu);
1600 
1601 		if (dmar_domain->use_first_level)
1602 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603 		else
1604 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605 						 DMA_TLB_DSI_FLUSH);
1606 
1607 		if (!cap_caching_mode(iommu->cap))
1608 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609 	}
1610 }
1611 
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614 	u32 pmen;
1615 	unsigned long flags;
1616 
1617 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618 		return;
1619 
1620 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622 	pmen &= ~DMA_PMEN_EPM;
1623 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624 
1625 	/* wait for the protected region status bit to clear */
1626 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631 
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634 	u32 sts;
1635 	unsigned long flags;
1636 
1637 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 	iommu->gcmd |= DMA_GCMD_TE;
1639 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640 
1641 	/* Make sure hardware complete it */
1642 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643 		      readl, (sts & DMA_GSTS_TES), sts);
1644 
1645 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647 
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650 	u32 sts;
1651 	unsigned long flag;
1652 
1653 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655 		return;
1656 
1657 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658 	iommu->gcmd &= ~DMA_GCMD_TE;
1659 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660 
1661 	/* Make sure hardware complete it */
1662 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1664 
1665 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667 
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670 	u32 ndomains;
1671 
1672 	ndomains = cap_ndoms(iommu->cap);
1673 	pr_debug("%s: Number of Domains supported <%d>\n",
1674 		 iommu->name, ndomains);
1675 
1676 	spin_lock_init(&iommu->lock);
1677 
1678 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679 	if (!iommu->domain_ids)
1680 		return -ENOMEM;
1681 
1682 	/*
1683 	 * If Caching mode is set, then invalid translations are tagged
1684 	 * with domain-id 0, hence we need to pre-allocate it. We also
1685 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1686 	 * make sure it is not used for a real domain.
1687 	 */
1688 	set_bit(0, iommu->domain_ids);
1689 
1690 	/*
1691 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692 	 * entry for first-level or pass-through translation modes should
1693 	 * be programmed with a domain id different from those used for
1694 	 * second-level or nested translation. We reserve a domain id for
1695 	 * this purpose.
1696 	 */
1697 	if (sm_supported(iommu))
1698 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699 
1700 	return 0;
1701 }
1702 
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705 	if (!iommu->domain_ids)
1706 		return;
1707 
1708 	/*
1709 	 * All iommu domains must have been detached from the devices,
1710 	 * hence there should be no domain IDs in use.
1711 	 */
1712 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713 		    > NUM_RESERVED_DID))
1714 		return;
1715 
1716 	if (iommu->gcmd & DMA_GCMD_TE)
1717 		iommu_disable_translation(iommu);
1718 }
1719 
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722 	if (iommu->domain_ids) {
1723 		bitmap_free(iommu->domain_ids);
1724 		iommu->domain_ids = NULL;
1725 	}
1726 
1727 	if (iommu->copied_tables) {
1728 		bitmap_free(iommu->copied_tables);
1729 		iommu->copied_tables = NULL;
1730 	}
1731 
1732 	/* free context mapping */
1733 	free_context_table(iommu);
1734 
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736 	if (pasid_supported(iommu)) {
1737 		if (ecap_prs(iommu->ecap))
1738 			intel_svm_finish_prq(iommu);
1739 	}
1740 #endif
1741 }
1742 
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749 	/* Only SL is available in legacy mode */
1750 	if (!scalable_mode_support())
1751 		return false;
1752 
1753 	/* Only level (either FL or SL) is available, just use it */
1754 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 		return intel_cap_flts_sanity();
1756 
1757 	/* Both levels are available, decide it based on domain type */
1758 	return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760 
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763 	struct dmar_domain *domain;
1764 
1765 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766 	if (!domain)
1767 		return NULL;
1768 
1769 	domain->nid = NUMA_NO_NODE;
1770 	if (first_level_by_default(type))
1771 		domain->use_first_level = true;
1772 	domain->has_iotlb_device = false;
1773 	INIT_LIST_HEAD(&domain->devices);
1774 	INIT_LIST_HEAD(&domain->dev_pasids);
1775 	spin_lock_init(&domain->lock);
1776 	xa_init(&domain->iommu_array);
1777 
1778 	return domain;
1779 }
1780 
1781 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1782 {
1783 	struct iommu_domain_info *info, *curr;
1784 	unsigned long ndomains;
1785 	int num, ret = -ENOSPC;
1786 
1787 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1788 	if (!info)
1789 		return -ENOMEM;
1790 
1791 	spin_lock(&iommu->lock);
1792 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1793 	if (curr) {
1794 		curr->refcnt++;
1795 		spin_unlock(&iommu->lock);
1796 		kfree(info);
1797 		return 0;
1798 	}
1799 
1800 	ndomains = cap_ndoms(iommu->cap);
1801 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802 	if (num >= ndomains) {
1803 		pr_err("%s: No free domain ids\n", iommu->name);
1804 		goto err_unlock;
1805 	}
1806 
1807 	set_bit(num, iommu->domain_ids);
1808 	info->refcnt	= 1;
1809 	info->did	= num;
1810 	info->iommu	= iommu;
1811 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1812 			  NULL, info, GFP_ATOMIC);
1813 	if (curr) {
1814 		ret = xa_err(curr) ? : -EBUSY;
1815 		goto err_clear;
1816 	}
1817 	domain_update_iommu_cap(domain);
1818 
1819 	spin_unlock(&iommu->lock);
1820 	return 0;
1821 
1822 err_clear:
1823 	clear_bit(info->did, iommu->domain_ids);
1824 err_unlock:
1825 	spin_unlock(&iommu->lock);
1826 	kfree(info);
1827 	return ret;
1828 }
1829 
1830 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1831 {
1832 	struct iommu_domain_info *info;
1833 
1834 	spin_lock(&iommu->lock);
1835 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1836 	if (--info->refcnt == 0) {
1837 		clear_bit(info->did, iommu->domain_ids);
1838 		xa_erase(&domain->iommu_array, iommu->seq_id);
1839 		domain->nid = NUMA_NO_NODE;
1840 		domain_update_iommu_cap(domain);
1841 		kfree(info);
1842 	}
1843 	spin_unlock(&iommu->lock);
1844 }
1845 
/*
 * Round a guest address width up to the next width of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1859 
1860 static void domain_exit(struct dmar_domain *domain)
1861 {
1862 	if (domain->pgd) {
1863 		LIST_HEAD(freelist);
1864 
1865 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1866 		put_pages_list(&freelist);
1867 	}
1868 
1869 	if (WARN_ON(!list_empty(&domain->devices)))
1870 		return;
1871 
1872 	kfree(domain);
1873 }
1874 
1875 /*
1876  * Get the PASID directory size for scalable mode context entry.
1877  * Value of X in the PDTS field of a scalable mode context entry
1878  * indicates PASID directory with 2^(X + 7) entries.
1879  */
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1881 {
1882 	unsigned long pds, max_pde;
1883 
1884 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1886 	if (pds < 7)
1887 		return 0;
1888 
1889 	return pds - 7;
1890 }
1891 
1892 /*
1893  * Set the RID_PASID field of a scalable mode context entry. The
1894  * IOMMU hardware will use the PASID value set in this field for
1895  * DMA translations of DMA requests without PASID.
1896  */
1897 static inline void
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1899 {
1900 	context->hi |= pasid & ((1 << 20) - 1);
1901 }
1902 
1903 /*
1904  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1905  * entry.
1906  */
1907 static inline void context_set_sm_dte(struct context_entry *context)
1908 {
1909 	context->lo |= BIT_ULL(2);
1910 }
1911 
1912 /*
1913  * Set the PRE(Page Request Enable) field of a scalable mode context
1914  * entry.
1915  */
1916 static inline void context_set_sm_pre(struct context_entry *context)
1917 {
1918 	context->lo |= BIT_ULL(4);
1919 }
1920 
/*
 * Convert a PASID directory size value to its context-entry field coding:
 * the low three bits of @pds, placed at bits 11:9.
 */
#define context_pdts(pds)	((0x7 & (pds)) << 9)
1923 
1924 static int domain_context_mapping_one(struct dmar_domain *domain,
1925 				      struct intel_iommu *iommu,
1926 				      struct pasid_table *table,
1927 				      u8 bus, u8 devfn)
1928 {
1929 	struct device_domain_info *info =
1930 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1931 	u16 did = domain_id_iommu(domain, iommu);
1932 	int translation = CONTEXT_TT_MULTI_LEVEL;
1933 	struct context_entry *context;
1934 	int ret;
1935 
1936 	if (hw_pass_through && domain_type_is_si(domain))
1937 		translation = CONTEXT_TT_PASS_THROUGH;
1938 
1939 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1940 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1941 
1942 	spin_lock(&iommu->lock);
1943 	ret = -ENOMEM;
1944 	context = iommu_context_addr(iommu, bus, devfn, 1);
1945 	if (!context)
1946 		goto out_unlock;
1947 
1948 	ret = 0;
1949 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1950 		goto out_unlock;
1951 
1952 	/*
1953 	 * For kdump cases, old valid entries may be cached due to the
1954 	 * in-flight DMA and copied pgtable, but there is no unmapping
1955 	 * behaviour for them, thus we need an explicit cache flush for
1956 	 * the newly-mapped device. For kdump, at this point, the device
1957 	 * is supposed to finish reset at its driver probe stage, so no
1958 	 * in-flight DMA will exist, and we don't need to worry anymore
1959 	 * hereafter.
1960 	 */
1961 	if (context_copied(iommu, bus, devfn)) {
1962 		u16 did_old = context_domain_id(context);
1963 
1964 		if (did_old < cap_ndoms(iommu->cap)) {
1965 			iommu->flush.flush_context(iommu, did_old,
1966 						   (((u16)bus) << 8) | devfn,
1967 						   DMA_CCMD_MASK_NOBIT,
1968 						   DMA_CCMD_DEVICE_INVL);
1969 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1970 						 DMA_TLB_DSI_FLUSH);
1971 		}
1972 
1973 		clear_context_copied(iommu, bus, devfn);
1974 	}
1975 
1976 	context_clear_entry(context);
1977 
1978 	if (sm_supported(iommu)) {
1979 		unsigned long pds;
1980 
1981 		/* Setup the PASID DIR pointer: */
1982 		pds = context_get_sm_pds(table);
1983 		context->lo = (u64)virt_to_phys(table->table) |
1984 				context_pdts(pds);
1985 
1986 		/* Setup the RID_PASID field: */
1987 		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1988 
1989 		/*
1990 		 * Setup the Device-TLB enable bit and Page request
1991 		 * Enable bit:
1992 		 */
1993 		if (info && info->ats_supported)
1994 			context_set_sm_dte(context);
1995 		if (info && info->pri_supported)
1996 			context_set_sm_pre(context);
1997 		if (info && info->pasid_supported)
1998 			context_set_pasid(context);
1999 	} else {
2000 		struct dma_pte *pgd = domain->pgd;
2001 		int agaw;
2002 
2003 		context_set_domain_id(context, did);
2004 
2005 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2006 			/*
2007 			 * Skip top levels of page tables for iommu which has
2008 			 * less agaw than default. Unnecessary for PT mode.
2009 			 */
2010 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2011 				ret = -ENOMEM;
2012 				pgd = phys_to_virt(dma_pte_addr(pgd));
2013 				if (!dma_pte_present(pgd))
2014 					goto out_unlock;
2015 			}
2016 
2017 			if (info && info->ats_supported)
2018 				translation = CONTEXT_TT_DEV_IOTLB;
2019 			else
2020 				translation = CONTEXT_TT_MULTI_LEVEL;
2021 
2022 			context_set_address_root(context, virt_to_phys(pgd));
2023 			context_set_address_width(context, agaw);
2024 		} else {
2025 			/*
2026 			 * In pass through mode, AW must be programmed to
2027 			 * indicate the largest AGAW value supported by
2028 			 * hardware. And ASR is ignored by hardware.
2029 			 */
2030 			context_set_address_width(context, iommu->msagaw);
2031 		}
2032 
2033 		context_set_translation_type(context, translation);
2034 	}
2035 
2036 	context_set_fault_enable(context);
2037 	context_set_present(context);
2038 	if (!ecap_coherent(iommu->ecap))
2039 		clflush_cache_range(context, sizeof(*context));
2040 
2041 	/*
2042 	 * It's a non-present to present mapping. If hardware doesn't cache
2043 	 * non-present entry we only need to flush the write-buffer. If the
2044 	 * _does_ cache non-present entries, then it does so in the special
2045 	 * domain #0, which we have to flush:
2046 	 */
2047 	if (cap_caching_mode(iommu->cap)) {
2048 		iommu->flush.flush_context(iommu, 0,
2049 					   (((u16)bus) << 8) | devfn,
2050 					   DMA_CCMD_MASK_NOBIT,
2051 					   DMA_CCMD_DEVICE_INVL);
2052 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2053 	} else {
2054 		iommu_flush_write_buffer(iommu);
2055 	}
2056 
2057 	ret = 0;
2058 
2059 out_unlock:
2060 	spin_unlock(&iommu->lock);
2061 
2062 	return ret;
2063 }
2064 
/*
 * Arguments bundled for domain_context_mapping_cb(), which receives them
 * through the opaque pointer of pci_for_each_dma_alias().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain the device is mapped into */
	struct intel_iommu *iommu;	/* IOMMU whose context entry is set up */
	struct pasid_table *table;	/* PASID table passed on to the mapper */
};
2070 
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072 				     u16 alias, void *opaque)
2073 {
2074 	struct domain_context_mapping_data *data = opaque;
2075 
2076 	return domain_context_mapping_one(data->domain, data->iommu,
2077 					  data->table, PCI_BUS_NUM(alias),
2078 					  alias & 0xff);
2079 }
2080 
2081 static int
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2083 {
2084 	struct domain_context_mapping_data data;
2085 	struct pasid_table *table;
2086 	struct intel_iommu *iommu;
2087 	u8 bus, devfn;
2088 
2089 	iommu = device_to_iommu(dev, &bus, &devfn);
2090 	if (!iommu)
2091 		return -ENODEV;
2092 
2093 	table = intel_pasid_get_table(dev);
2094 
2095 	if (!dev_is_pci(dev))
2096 		return domain_context_mapping_one(domain, iommu, table,
2097 						  bus, devfn);
2098 
2099 	data.domain = domain;
2100 	data.iommu = iommu;
2101 	data.table = table;
2102 
2103 	return pci_for_each_dma_alias(to_pci_dev(dev),
2104 				      &domain_context_mapping_cb, &data);
2105 }
2106 
2107 /* Returns a number of VTD pages, but aligned to MM page size */
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109 					    size_t size)
2110 {
2111 	host_addr &= ~PAGE_MASK;
2112 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114 
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117 					  unsigned long iov_pfn,
2118 					  unsigned long phy_pfn,
2119 					  unsigned long pages)
2120 {
2121 	int support, level = 1;
2122 	unsigned long pfnmerge;
2123 
2124 	support = domain->iommu_superpage;
2125 
2126 	/* To use a large page, the virtual *and* physical addresses
2127 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128 	   of them will mean we have to use smaller pages. So just
2129 	   merge them and check both at once. */
2130 	pfnmerge = iov_pfn | phy_pfn;
2131 
2132 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133 		pages >>= VTD_STRIDE_SHIFT;
2134 		if (!pages)
2135 			break;
2136 		pfnmerge >>= VTD_STRIDE_SHIFT;
2137 		level++;
2138 		support--;
2139 	}
2140 	return level;
2141 }
2142 
2143 /*
2144  * Ensure that old small page tables are removed to make room for superpage(s).
2145  * We're going to add new large pages, so make sure we don't remove their parent
2146  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2147  */
2148 static void switch_to_super_page(struct dmar_domain *domain,
2149 				 unsigned long start_pfn,
2150 				 unsigned long end_pfn, int level)
2151 {
2152 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2153 	struct iommu_domain_info *info;
2154 	struct dma_pte *pte = NULL;
2155 	unsigned long i;
2156 
2157 	while (start_pfn <= end_pfn) {
2158 		if (!pte)
2159 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2160 					     GFP_ATOMIC);
2161 
2162 		if (dma_pte_present(pte)) {
2163 			dma_pte_free_pagetable(domain, start_pfn,
2164 					       start_pfn + lvl_pages - 1,
2165 					       level + 1);
2166 
2167 			xa_for_each(&domain->iommu_array, i, info)
2168 				iommu_flush_iotlb_psi(info->iommu, domain,
2169 						      start_pfn, lvl_pages,
2170 						      0, 0);
2171 		}
2172 
2173 		pte++;
2174 		start_pfn += lvl_pages;
2175 		if (first_pte_in_page(pte))
2176 			pte = NULL;
2177 	}
2178 }
2179 
2180 static int
2181 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2182 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2183 		 gfp_t gfp)
2184 {
2185 	struct dma_pte *first_pte = NULL, *pte = NULL;
2186 	unsigned int largepage_lvl = 0;
2187 	unsigned long lvl_pages = 0;
2188 	phys_addr_t pteval;
2189 	u64 attr;
2190 
2191 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2192 		return -EINVAL;
2193 
2194 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2195 		return -EINVAL;
2196 
2197 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2198 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2199 		return -EINVAL;
2200 	}
2201 
2202 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2203 	attr |= DMA_FL_PTE_PRESENT;
2204 	if (domain->use_first_level) {
2205 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2206 		if (prot & DMA_PTE_WRITE)
2207 			attr |= DMA_FL_PTE_DIRTY;
2208 	}
2209 
2210 	domain->has_mappings = true;
2211 
2212 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2213 
2214 	while (nr_pages > 0) {
2215 		uint64_t tmp;
2216 
2217 		if (!pte) {
2218 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2219 					phys_pfn, nr_pages);
2220 
2221 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2222 					     gfp);
2223 			if (!pte)
2224 				return -ENOMEM;
2225 			first_pte = pte;
2226 
2227 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2228 
2229 			/* It is large page*/
2230 			if (largepage_lvl > 1) {
2231 				unsigned long end_pfn;
2232 				unsigned long pages_to_remove;
2233 
2234 				pteval |= DMA_PTE_LARGE_PAGE;
2235 				pages_to_remove = min_t(unsigned long, nr_pages,
2236 							nr_pte_to_next_page(pte) * lvl_pages);
2237 				end_pfn = iov_pfn + pages_to_remove - 1;
2238 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2239 			} else {
2240 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2241 			}
2242 
2243 		}
2244 		/* We don't need lock here, nobody else
2245 		 * touches the iova range
2246 		 */
2247 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2248 		if (tmp) {
2249 			static int dumps = 5;
2250 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2251 				iov_pfn, tmp, (unsigned long long)pteval);
2252 			if (dumps) {
2253 				dumps--;
2254 				debug_dma_dump_mappings(NULL);
2255 			}
2256 			WARN_ON(1);
2257 		}
2258 
2259 		nr_pages -= lvl_pages;
2260 		iov_pfn += lvl_pages;
2261 		phys_pfn += lvl_pages;
2262 		pteval += lvl_pages * VTD_PAGE_SIZE;
2263 
2264 		/* If the next PTE would be the first in a new page, then we
2265 		 * need to flush the cache on the entries we've just written.
2266 		 * And then we'll need to recalculate 'pte', so clear it and
2267 		 * let it get set again in the if (!pte) block above.
2268 		 *
2269 		 * If we're done (!nr_pages) we need to flush the cache too.
2270 		 *
2271 		 * Also if we've been setting superpages, we may need to
2272 		 * recalculate 'pte' and switch back to smaller pages for the
2273 		 * end of the mapping, if the trailing size is not enough to
2274 		 * use another superpage (i.e. nr_pages < lvl_pages).
2275 		 */
2276 		pte++;
2277 		if (!nr_pages || first_pte_in_page(pte) ||
2278 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2279 			domain_flush_cache(domain, first_pte,
2280 					   (void *)pte - (void *)first_pte);
2281 			pte = NULL;
2282 		}
2283 	}
2284 
2285 	return 0;
2286 }
2287 
2288 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2289 {
2290 	struct intel_iommu *iommu = info->iommu;
2291 	struct context_entry *context;
2292 	u16 did_old;
2293 
2294 	if (!iommu)
2295 		return;
2296 
2297 	spin_lock(&iommu->lock);
2298 	context = iommu_context_addr(iommu, bus, devfn, 0);
2299 	if (!context) {
2300 		spin_unlock(&iommu->lock);
2301 		return;
2302 	}
2303 
2304 	if (sm_supported(iommu)) {
2305 		if (hw_pass_through && domain_type_is_si(info->domain))
2306 			did_old = FLPT_DEFAULT_DID;
2307 		else
2308 			did_old = domain_id_iommu(info->domain, iommu);
2309 	} else {
2310 		did_old = context_domain_id(context);
2311 	}
2312 
2313 	context_clear_entry(context);
2314 	__iommu_flush_cache(iommu, context, sizeof(*context));
2315 	spin_unlock(&iommu->lock);
2316 	iommu->flush.flush_context(iommu,
2317 				   did_old,
2318 				   (((u16)bus) << 8) | devfn,
2319 				   DMA_CCMD_MASK_NOBIT,
2320 				   DMA_CCMD_DEVICE_INVL);
2321 
2322 	if (sm_supported(iommu))
2323 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2324 
2325 	iommu->flush.flush_iotlb(iommu,
2326 				 did_old,
2327 				 0,
2328 				 0,
2329 				 DMA_TLB_DSI_FLUSH);
2330 
2331 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2332 }
2333 
2334 static int domain_setup_first_level(struct intel_iommu *iommu,
2335 				    struct dmar_domain *domain,
2336 				    struct device *dev,
2337 				    u32 pasid)
2338 {
2339 	struct dma_pte *pgd = domain->pgd;
2340 	int agaw, level;
2341 	int flags = 0;
2342 
2343 	/*
2344 	 * Skip top levels of page tables for iommu which has
2345 	 * less agaw than default. Unnecessary for PT mode.
2346 	 */
2347 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2348 		pgd = phys_to_virt(dma_pte_addr(pgd));
2349 		if (!dma_pte_present(pgd))
2350 			return -ENOMEM;
2351 	}
2352 
2353 	level = agaw_to_level(agaw);
2354 	if (level != 4 && level != 5)
2355 		return -EINVAL;
2356 
2357 	if (level == 5)
2358 		flags |= PASID_FLAG_FL5LP;
2359 
2360 	if (domain->force_snooping)
2361 		flags |= PASID_FLAG_PAGE_SNOOP;
2362 
2363 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2364 					     domain_id_iommu(domain, iommu),
2365 					     flags);
2366 }
2367 
2368 static bool dev_is_real_dma_subdevice(struct device *dev)
2369 {
2370 	return dev && dev_is_pci(dev) &&
2371 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 }
2373 
2374 static int iommu_domain_identity_map(struct dmar_domain *domain,
2375 				     unsigned long first_vpfn,
2376 				     unsigned long last_vpfn)
2377 {
2378 	/*
2379 	 * RMRR range might have overlap with physical memory range,
2380 	 * clear it first
2381 	 */
2382 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2383 
2384 	return __domain_mapping(domain, first_vpfn,
2385 				first_vpfn, last_vpfn - first_vpfn + 1,
2386 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2387 }
2388 
2389 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2390 
2391 static int __init si_domain_init(int hw)
2392 {
2393 	struct dmar_rmrr_unit *rmrr;
2394 	struct device *dev;
2395 	int i, nid, ret;
2396 
2397 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2398 	if (!si_domain)
2399 		return -EFAULT;
2400 
2401 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2402 		domain_exit(si_domain);
2403 		si_domain = NULL;
2404 		return -EFAULT;
2405 	}
2406 
2407 	if (hw)
2408 		return 0;
2409 
2410 	for_each_online_node(nid) {
2411 		unsigned long start_pfn, end_pfn;
2412 		int i;
2413 
2414 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2415 			ret = iommu_domain_identity_map(si_domain,
2416 					mm_to_dma_pfn_start(start_pfn),
2417 					mm_to_dma_pfn_end(end_pfn));
2418 			if (ret)
2419 				return ret;
2420 		}
2421 	}
2422 
2423 	/*
2424 	 * Identity map the RMRRs so that devices with RMRRs could also use
2425 	 * the si_domain.
2426 	 */
2427 	for_each_rmrr_units(rmrr) {
2428 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2429 					  i, dev) {
2430 			unsigned long long start = rmrr->base_address;
2431 			unsigned long long end = rmrr->end_address;
2432 
2433 			if (WARN_ON(end < start ||
2434 				    end >> agaw_to_width(si_domain->agaw)))
2435 				continue;
2436 
2437 			ret = iommu_domain_identity_map(si_domain,
2438 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2439 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2440 			if (ret)
2441 				return ret;
2442 		}
2443 	}
2444 
2445 	return 0;
2446 }
2447 
2448 static int dmar_domain_attach_device(struct dmar_domain *domain,
2449 				     struct device *dev)
2450 {
2451 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2452 	struct intel_iommu *iommu;
2453 	unsigned long flags;
2454 	u8 bus, devfn;
2455 	int ret;
2456 
2457 	iommu = device_to_iommu(dev, &bus, &devfn);
2458 	if (!iommu)
2459 		return -ENODEV;
2460 
2461 	ret = domain_attach_iommu(domain, iommu);
2462 	if (ret)
2463 		return ret;
2464 	info->domain = domain;
2465 	spin_lock_irqsave(&domain->lock, flags);
2466 	list_add(&info->link, &domain->devices);
2467 	spin_unlock_irqrestore(&domain->lock, flags);
2468 
2469 	/* PASID table is mandatory for a PCI device in scalable mode. */
2470 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2471 		/* Setup the PASID entry for requests without PASID: */
2472 		if (hw_pass_through && domain_type_is_si(domain))
2473 			ret = intel_pasid_setup_pass_through(iommu, domain,
2474 					dev, IOMMU_NO_PASID);
2475 		else if (domain->use_first_level)
2476 			ret = domain_setup_first_level(iommu, domain, dev,
2477 					IOMMU_NO_PASID);
2478 		else
2479 			ret = intel_pasid_setup_second_level(iommu, domain,
2480 					dev, IOMMU_NO_PASID);
2481 		if (ret) {
2482 			dev_err(dev, "Setup RID2PASID failed\n");
2483 			device_block_translation(dev);
2484 			return ret;
2485 		}
2486 	}
2487 
2488 	ret = domain_context_mapping(domain, dev);
2489 	if (ret) {
2490 		dev_err(dev, "Domain context map failed\n");
2491 		device_block_translation(dev);
2492 		return ret;
2493 	}
2494 
2495 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2496 		iommu_enable_pci_caps(info);
2497 
2498 	return 0;
2499 }
2500 
2501 /**
2502  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2503  * is relaxable (ie. is allowed to be not enforced under some conditions)
2504  * @dev: device handle
2505  *
2506  * We assume that PCI USB devices with RMRRs have them largely
2507  * for historical reasons and that the RMRR space is not actively used post
2508  * boot.  This exclusion may change if vendors begin to abuse it.
2509  *
2510  * The same exception is made for graphics devices, with the requirement that
2511  * any use of the RMRR regions will be torn down before assigning the device
2512  * to a guest.
2513  *
2514  * Return: true if the RMRR is relaxable, false otherwise
2515  */
2516 static bool device_rmrr_is_relaxable(struct device *dev)
2517 {
2518 	struct pci_dev *pdev;
2519 
2520 	if (!dev_is_pci(dev))
2521 		return false;
2522 
2523 	pdev = to_pci_dev(dev);
2524 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2525 		return true;
2526 	else
2527 		return false;
2528 }
2529 
2530 /*
2531  * Return the required default domain type for a specific device.
2532  *
2533  * @dev: the device in query
2534  * @startup: true if this is during early boot
2535  *
2536  * Returns:
2537  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2538  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2539  *  - 0: both identity and dynamic domains work for this device
2540  */
2541 static int device_def_domain_type(struct device *dev)
2542 {
2543 	if (dev_is_pci(dev)) {
2544 		struct pci_dev *pdev = to_pci_dev(dev);
2545 
2546 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2547 			return IOMMU_DOMAIN_IDENTITY;
2548 
2549 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2550 			return IOMMU_DOMAIN_IDENTITY;
2551 	}
2552 
2553 	return 0;
2554 }
2555 
2556 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2557 {
2558 	/*
2559 	 * Start from the sane iommu hardware state.
2560 	 * If the queued invalidation is already initialized by us
2561 	 * (for example, while enabling interrupt-remapping) then
2562 	 * we got the things already rolling from a sane state.
2563 	 */
2564 	if (!iommu->qi) {
2565 		/*
2566 		 * Clear any previous faults.
2567 		 */
2568 		dmar_fault(-1, iommu);
2569 		/*
2570 		 * Disable queued invalidation if supported and already enabled
2571 		 * before OS handover.
2572 		 */
2573 		dmar_disable_qi(iommu);
2574 	}
2575 
2576 	if (dmar_enable_qi(iommu)) {
2577 		/*
2578 		 * Queued Invalidate not enabled, use Register Based Invalidate
2579 		 */
2580 		iommu->flush.flush_context = __iommu_flush_context;
2581 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2582 		pr_info("%s: Using Register based invalidation\n",
2583 			iommu->name);
2584 	} else {
2585 		iommu->flush.flush_context = qi_flush_context;
2586 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2587 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2588 	}
2589 }
2590 
/*
 * Copy the present context entries of one bus from an existing context
 * table into newly allocated table page(s).
 *
 * @iommu:  IOMMU the tables belong to
 * @old_re: root entry of @bus in the old root table (copied locally first)
 * @tbl:    array receiving the new table pages; slot @bus is used, or slots
 *          @bus * 2 and @bus * 2 + 1 when @ext is set
 * @bus:    bus number being copied
 * @ext:    selects the two-tables-per-bus layout (lower table for devfn
 *          0x00-0x7f, upper table for devfn 0x80-0xff)
 *
 * For every copied entry the domain id is marked as in use in
 * iommu->domain_ids (when within cap_ndoms()) and the bus/devfn is recorded
 * through set_context_copied().
 *
 * Returns 0 on success or when the old root entry has no table to copy,
 * -ENOMEM if remapping the old table or allocating a new page fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx is 0 at devfn 0 and, when @ext is set, again at devfn
		 * 0x80: this is where a new old-table/new-table pair starts.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The next table goes into the second slot. */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/* The loop increment moves devfn on to 0x80. */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the domain id found in the old entry. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/* Store the last table that was being filled. */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2674 
2675 static int copy_translation_tables(struct intel_iommu *iommu)
2676 {
2677 	struct context_entry **ctxt_tbls;
2678 	struct root_entry *old_rt;
2679 	phys_addr_t old_rt_phys;
2680 	int ctxt_table_entries;
2681 	u64 rtaddr_reg;
2682 	int bus, ret;
2683 	bool new_ext, ext;
2684 
2685 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2686 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2687 	new_ext    = !!sm_supported(iommu);
2688 
2689 	/*
2690 	 * The RTT bit can only be changed when translation is disabled,
2691 	 * but disabling translation means to open a window for data
2692 	 * corruption. So bail out and don't copy anything if we would
2693 	 * have to change the bit.
2694 	 */
2695 	if (new_ext != ext)
2696 		return -EINVAL;
2697 
2698 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2699 	if (!iommu->copied_tables)
2700 		return -ENOMEM;
2701 
2702 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2703 	if (!old_rt_phys)
2704 		return -EINVAL;
2705 
2706 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2707 	if (!old_rt)
2708 		return -ENOMEM;
2709 
2710 	/* This is too big for the stack - allocate it from slab */
2711 	ctxt_table_entries = ext ? 512 : 256;
2712 	ret = -ENOMEM;
2713 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2714 	if (!ctxt_tbls)
2715 		goto out_unmap;
2716 
2717 	for (bus = 0; bus < 256; bus++) {
2718 		ret = copy_context_table(iommu, &old_rt[bus],
2719 					 ctxt_tbls, bus, ext);
2720 		if (ret) {
2721 			pr_err("%s: Failed to copy context table for bus %d\n",
2722 				iommu->name, bus);
2723 			continue;
2724 		}
2725 	}
2726 
2727 	spin_lock(&iommu->lock);
2728 
2729 	/* Context tables are copied, now write them to the root_entry table */
2730 	for (bus = 0; bus < 256; bus++) {
2731 		int idx = ext ? bus * 2 : bus;
2732 		u64 val;
2733 
2734 		if (ctxt_tbls[idx]) {
2735 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2736 			iommu->root_entry[bus].lo = val;
2737 		}
2738 
2739 		if (!ext || !ctxt_tbls[idx + 1])
2740 			continue;
2741 
2742 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2743 		iommu->root_entry[bus].hi = val;
2744 	}
2745 
2746 	spin_unlock(&iommu->lock);
2747 
2748 	kfree(ctxt_tbls);
2749 
2750 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2751 
2752 	ret = 0;
2753 
2754 out_unmap:
2755 	memunmap(old_rt);
2756 
2757 	return ret;
2758 }
2759 
/*
 * Boot-time setup of all DMAR units: per-IOMMU invalidation method, domain
 * bookkeeping and root entry (optionally copying translation tables that
 * were already enabled), then the static identity domain, and finally
 * page-request queue (CONFIG_INTEL_IOMMU_SVM) and fault interrupt setup.
 *
 * Returns 0 on success. On any failure every active IOMMU is disabled and
 * freed, si_domain is released if it exists, and the error is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		/* Ignored units only get their translation switched off. */
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only kept in a kdump kernel. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* One IOMMU without pass-through support disables it globally. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	/* Graphics devices that are not to be mapped get identity mapping. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Common error exit: undo the per-IOMMU setup and drop si_domain. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2916 
2917 static void __init init_no_remapping_devices(void)
2918 {
2919 	struct dmar_drhd_unit *drhd;
2920 	struct device *dev;
2921 	int i;
2922 
2923 	for_each_drhd_unit(drhd) {
2924 		if (!drhd->include_all) {
2925 			for_each_active_dev_scope(drhd->devices,
2926 						  drhd->devices_cnt, i, dev)
2927 				break;
2928 			/* ignore DMAR unit if no devices exist */
2929 			if (i == drhd->devices_cnt)
2930 				drhd->ignored = 1;
2931 		}
2932 	}
2933 
2934 	for_each_active_drhd_unit(drhd) {
2935 		if (drhd->include_all)
2936 			continue;
2937 
2938 		for_each_active_dev_scope(drhd->devices,
2939 					  drhd->devices_cnt, i, dev)
2940 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2941 				break;
2942 		if (i < drhd->devices_cnt)
2943 			continue;
2944 
2945 		/* This IOMMU has *only* gfx devices. Either bypass it or
2946 		   set the gfx_mapped flag, as appropriate */
2947 		drhd->gfx_dedicated = 1;
2948 		if (!dmar_map_gfx)
2949 			drhd->ignored = 1;
2950 	}
2951 }
2952 
2953 #ifdef CONFIG_SUSPEND
2954 static int init_iommu_hw(void)
2955 {
2956 	struct dmar_drhd_unit *drhd;
2957 	struct intel_iommu *iommu = NULL;
2958 	int ret;
2959 
2960 	for_each_active_iommu(iommu, drhd) {
2961 		if (iommu->qi) {
2962 			ret = dmar_reenable_qi(iommu);
2963 			if (ret)
2964 				return ret;
2965 		}
2966 	}
2967 
2968 	for_each_iommu(iommu, drhd) {
2969 		if (drhd->ignored) {
2970 			/*
2971 			 * we always have to disable PMRs or DMA may fail on
2972 			 * this device
2973 			 */
2974 			if (force_on)
2975 				iommu_disable_protect_mem_regions(iommu);
2976 			continue;
2977 		}
2978 
2979 		iommu_flush_write_buffer(iommu);
2980 		iommu_set_root_entry(iommu);
2981 		iommu_enable_translation(iommu);
2982 		iommu_disable_protect_mem_regions(iommu);
2983 	}
2984 
2985 	return 0;
2986 }
2987 
2988 static void iommu_flush_all(void)
2989 {
2990 	struct dmar_drhd_unit *drhd;
2991 	struct intel_iommu *iommu;
2992 
2993 	for_each_active_iommu(iommu, drhd) {
2994 		iommu->flush.flush_context(iommu, 0, 0, 0,
2995 					   DMA_CCMD_GLOBAL_INVL);
2996 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2997 					 DMA_TLB_GLOBAL_FLUSH);
2998 	}
2999 }
3000 
3001 static int iommu_suspend(void)
3002 {
3003 	struct dmar_drhd_unit *drhd;
3004 	struct intel_iommu *iommu = NULL;
3005 	unsigned long flag;
3006 
3007 	iommu_flush_all();
3008 
3009 	for_each_active_iommu(iommu, drhd) {
3010 		iommu_disable_translation(iommu);
3011 
3012 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3013 
3014 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3015 			readl(iommu->reg + DMAR_FECTL_REG);
3016 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3017 			readl(iommu->reg + DMAR_FEDATA_REG);
3018 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3019 			readl(iommu->reg + DMAR_FEADDR_REG);
3020 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3021 			readl(iommu->reg + DMAR_FEUADDR_REG);
3022 
3023 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3024 	}
3025 	return 0;
3026 }
3027 
3028 static void iommu_resume(void)
3029 {
3030 	struct dmar_drhd_unit *drhd;
3031 	struct intel_iommu *iommu = NULL;
3032 	unsigned long flag;
3033 
3034 	if (init_iommu_hw()) {
3035 		if (force_on)
3036 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3037 		else
3038 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3039 		return;
3040 	}
3041 
3042 	for_each_active_iommu(iommu, drhd) {
3043 
3044 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3045 
3046 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3047 			iommu->reg + DMAR_FECTL_REG);
3048 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3049 			iommu->reg + DMAR_FEDATA_REG);
3050 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3051 			iommu->reg + DMAR_FEADDR_REG);
3052 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3053 			iommu->reg + DMAR_FEUADDR_REG);
3054 
3055 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3056 	}
3057 }
3058 
3059 static struct syscore_ops iommu_syscore_ops = {
3060 	.resume		= iommu_resume,
3061 	.suspend	= iommu_suspend,
3062 };
3063 
/* Register the IOMMU suspend/resume hooks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
3068 
3069 #else
/* Without CONFIG_SUSPEND there are no suspend/resume hooks to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
3072 
3073 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3074 {
3075 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3076 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3077 	    rmrr->end_address <= rmrr->base_address ||
3078 	    arch_rmrr_sanity_check(rmrr))
3079 		return -EINVAL;
3080 
3081 	return 0;
3082 }
3083 
3084 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3085 {
3086 	struct acpi_dmar_reserved_memory *rmrr;
3087 	struct dmar_rmrr_unit *rmrru;
3088 
3089 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3090 	if (rmrr_sanity_check(rmrr)) {
3091 		pr_warn(FW_BUG
3092 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3093 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3094 			   rmrr->base_address, rmrr->end_address,
3095 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3096 			   dmi_get_system_info(DMI_BIOS_VERSION),
3097 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3098 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3099 	}
3100 
3101 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3102 	if (!rmrru)
3103 		goto out;
3104 
3105 	rmrru->hdr = header;
3106 
3107 	rmrru->base_address = rmrr->base_address;
3108 	rmrru->end_address = rmrr->end_address;
3109 
3110 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3111 				((void *)rmrr) + rmrr->header.length,
3112 				&rmrru->devices_cnt);
3113 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3114 		goto free_rmrru;
3115 
3116 	list_add(&rmrru->list, &dmar_rmrr_units);
3117 
3118 	return 0;
3119 free_rmrru:
3120 	kfree(rmrru);
3121 out:
3122 	return -ENOMEM;
3123 }
3124 
3125 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3126 {
3127 	struct dmar_atsr_unit *atsru;
3128 	struct acpi_dmar_atsr *tmp;
3129 
3130 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3131 				dmar_rcu_check()) {
3132 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3133 		if (atsr->segment != tmp->segment)
3134 			continue;
3135 		if (atsr->header.length != tmp->header.length)
3136 			continue;
3137 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3138 			return atsru;
3139 	}
3140 
3141 	return NULL;
3142 }
3143 
3144 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3145 {
3146 	struct acpi_dmar_atsr *atsr;
3147 	struct dmar_atsr_unit *atsru;
3148 
3149 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3150 		return 0;
3151 
3152 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3153 	atsru = dmar_find_atsr(atsr);
3154 	if (atsru)
3155 		return 0;
3156 
3157 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3158 	if (!atsru)
3159 		return -ENOMEM;
3160 
3161 	/*
3162 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3163 	 * copy the memory content because the memory buffer will be freed
3164 	 * on return.
3165 	 */
3166 	atsru->hdr = (void *)(atsru + 1);
3167 	memcpy(atsru->hdr, hdr, hdr->length);
3168 	atsru->include_all = atsr->flags & 0x1;
3169 	if (!atsru->include_all) {
3170 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3171 				(void *)atsr + atsr->header.length,
3172 				&atsru->devices_cnt);
3173 		if (atsru->devices_cnt && atsru->devices == NULL) {
3174 			kfree(atsru);
3175 			return -ENOMEM;
3176 		}
3177 	}
3178 
3179 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3180 
3181 	return 0;
3182 }
3183 
3184 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3185 {
3186 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3187 	kfree(atsru);
3188 }
3189 
3190 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3191 {
3192 	struct acpi_dmar_atsr *atsr;
3193 	struct dmar_atsr_unit *atsru;
3194 
3195 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3196 	atsru = dmar_find_atsr(atsr);
3197 	if (atsru) {
3198 		list_del_rcu(&atsru->list);
3199 		synchronize_rcu();
3200 		intel_iommu_free_atsr(atsru);
3201 	}
3202 
3203 	return 0;
3204 }
3205 
3206 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3207 {
3208 	int i;
3209 	struct device *dev;
3210 	struct acpi_dmar_atsr *atsr;
3211 	struct dmar_atsr_unit *atsru;
3212 
3213 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3214 	atsru = dmar_find_atsr(atsr);
3215 	if (!atsru)
3216 		return 0;
3217 
3218 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3219 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3220 					  i, dev)
3221 			return -EBUSY;
3222 	}
3223 
3224 	return 0;
3225 }
3226 
3227 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3228 {
3229 	struct dmar_satc_unit *satcu;
3230 	struct acpi_dmar_satc *tmp;
3231 
3232 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3233 				dmar_rcu_check()) {
3234 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3235 		if (satc->segment != tmp->segment)
3236 			continue;
3237 		if (satc->header.length != tmp->header.length)
3238 			continue;
3239 		if (memcmp(satc, tmp, satc->header.length) == 0)
3240 			return satcu;
3241 	}
3242 
3243 	return NULL;
3244 }
3245 
3246 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3247 {
3248 	struct acpi_dmar_satc *satc;
3249 	struct dmar_satc_unit *satcu;
3250 
3251 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3252 		return 0;
3253 
3254 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3255 	satcu = dmar_find_satc(satc);
3256 	if (satcu)
3257 		return 0;
3258 
3259 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3260 	if (!satcu)
3261 		return -ENOMEM;
3262 
3263 	satcu->hdr = (void *)(satcu + 1);
3264 	memcpy(satcu->hdr, hdr, hdr->length);
3265 	satcu->atc_required = satc->flags & 0x1;
3266 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3267 					      (void *)satc + satc->header.length,
3268 					      &satcu->devices_cnt);
3269 	if (satcu->devices_cnt && !satcu->devices) {
3270 		kfree(satcu);
3271 		return -ENOMEM;
3272 	}
3273 	list_add_rcu(&satcu->list, &dmar_satc_units);
3274 
3275 	return 0;
3276 }
3277 
3278 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3279 {
3280 	int sp, ret;
3281 	struct intel_iommu *iommu = dmaru->iommu;
3282 
3283 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3284 	if (ret)
3285 		goto out;
3286 
3287 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3288 		pr_warn("%s: Doesn't support hardware pass through.\n",
3289 			iommu->name);
3290 		return -ENXIO;
3291 	}
3292 
3293 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3294 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3295 		pr_warn("%s: Doesn't support large page.\n",
3296 			iommu->name);
3297 		return -ENXIO;
3298 	}
3299 
3300 	/*
3301 	 * Disable translation if already enabled prior to OS handover.
3302 	 */
3303 	if (iommu->gcmd & DMA_GCMD_TE)
3304 		iommu_disable_translation(iommu);
3305 
3306 	ret = iommu_init_domains(iommu);
3307 	if (ret == 0)
3308 		ret = iommu_alloc_root_entry(iommu);
3309 	if (ret)
3310 		goto out;
3311 
3312 	intel_svm_check(iommu);
3313 
3314 	if (dmaru->ignored) {
3315 		/*
3316 		 * we always have to disable PMRs or DMA may fail on this device
3317 		 */
3318 		if (force_on)
3319 			iommu_disable_protect_mem_regions(iommu);
3320 		return 0;
3321 	}
3322 
3323 	intel_iommu_init_qi(iommu);
3324 	iommu_flush_write_buffer(iommu);
3325 
3326 #ifdef CONFIG_INTEL_IOMMU_SVM
3327 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3328 		ret = intel_svm_enable_prq(iommu);
3329 		if (ret)
3330 			goto disable_iommu;
3331 	}
3332 #endif
3333 	ret = dmar_set_interrupt(iommu);
3334 	if (ret)
3335 		goto disable_iommu;
3336 
3337 	iommu_set_root_entry(iommu);
3338 	iommu_enable_translation(iommu);
3339 
3340 	iommu_disable_protect_mem_regions(iommu);
3341 	return 0;
3342 
3343 disable_iommu:
3344 	disable_dmar_iommu(iommu);
3345 out:
3346 	free_dmar_iommu(iommu);
3347 	return ret;
3348 }
3349 
3350 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3351 {
3352 	int ret = 0;
3353 	struct intel_iommu *iommu = dmaru->iommu;
3354 
3355 	if (!intel_iommu_enabled)
3356 		return 0;
3357 	if (iommu == NULL)
3358 		return -EINVAL;
3359 
3360 	if (insert) {
3361 		ret = intel_iommu_add(dmaru);
3362 	} else {
3363 		disable_dmar_iommu(iommu);
3364 		free_dmar_iommu(iommu);
3365 	}
3366 
3367 	return ret;
3368 }
3369 
3370 static void intel_iommu_free_dmars(void)
3371 {
3372 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3373 	struct dmar_atsr_unit *atsru, *atsr_n;
3374 	struct dmar_satc_unit *satcu, *satc_n;
3375 
3376 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3377 		list_del(&rmrru->list);
3378 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3379 		kfree(rmrru);
3380 	}
3381 
3382 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3383 		list_del(&atsru->list);
3384 		intel_iommu_free_atsr(atsru);
3385 	}
3386 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3387 		list_del(&satcu->list);
3388 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3389 		kfree(satcu);
3390 	}
3391 }
3392 
3393 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3394 {
3395 	struct dmar_satc_unit *satcu;
3396 	struct acpi_dmar_satc *satc;
3397 	struct device *tmp;
3398 	int i;
3399 
3400 	dev = pci_physfn(dev);
3401 	rcu_read_lock();
3402 
3403 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3404 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3405 		if (satc->segment != pci_domain_nr(dev->bus))
3406 			continue;
3407 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3408 			if (to_pci_dev(tmp) == dev)
3409 				goto out;
3410 	}
3411 	satcu = NULL;
3412 out:
3413 	rcu_read_unlock();
3414 	return satcu;
3415 }
3416 
3417 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3418 {
3419 	int i, ret = 1;
3420 	struct pci_bus *bus;
3421 	struct pci_dev *bridge = NULL;
3422 	struct device *tmp;
3423 	struct acpi_dmar_atsr *atsr;
3424 	struct dmar_atsr_unit *atsru;
3425 	struct dmar_satc_unit *satcu;
3426 
3427 	dev = pci_physfn(dev);
3428 	satcu = dmar_find_matched_satc_unit(dev);
3429 	if (satcu)
3430 		/*
3431 		 * This device supports ATS as it is in SATC table.
3432 		 * When IOMMU is in legacy mode, enabling ATS is done
3433 		 * automatically by HW for the device that requires
3434 		 * ATS, hence OS should not enable this device ATS
3435 		 * to avoid duplicated TLB invalidation.
3436 		 */
3437 		return !(satcu->atc_required && !sm_supported(iommu));
3438 
3439 	for (bus = dev->bus; bus; bus = bus->parent) {
3440 		bridge = bus->self;
3441 		/* If it's an integrated device, allow ATS */
3442 		if (!bridge)
3443 			return 1;
3444 		/* Connected via non-PCIe: no ATS */
3445 		if (!pci_is_pcie(bridge) ||
3446 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3447 			return 0;
3448 		/* If we found the root port, look it up in the ATSR */
3449 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3450 			break;
3451 	}
3452 
3453 	rcu_read_lock();
3454 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3455 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3456 		if (atsr->segment != pci_domain_nr(dev->bus))
3457 			continue;
3458 
3459 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3460 			if (tmp == &bridge->dev)
3461 				goto out;
3462 
3463 		if (atsru->include_all)
3464 			goto out;
3465 	}
3466 	ret = 0;
3467 out:
3468 	rcu_read_unlock();
3469 
3470 	return ret;
3471 }
3472 
3473 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3474 {
3475 	int ret;
3476 	struct dmar_rmrr_unit *rmrru;
3477 	struct dmar_atsr_unit *atsru;
3478 	struct dmar_satc_unit *satcu;
3479 	struct acpi_dmar_atsr *atsr;
3480 	struct acpi_dmar_reserved_memory *rmrr;
3481 	struct acpi_dmar_satc *satc;
3482 
3483 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3484 		return 0;
3485 
3486 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3487 		rmrr = container_of(rmrru->hdr,
3488 				    struct acpi_dmar_reserved_memory, header);
3489 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3490 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3491 				((void *)rmrr) + rmrr->header.length,
3492 				rmrr->segment, rmrru->devices,
3493 				rmrru->devices_cnt);
3494 			if (ret < 0)
3495 				return ret;
3496 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3497 			dmar_remove_dev_scope(info, rmrr->segment,
3498 				rmrru->devices, rmrru->devices_cnt);
3499 		}
3500 	}
3501 
3502 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3503 		if (atsru->include_all)
3504 			continue;
3505 
3506 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3507 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3508 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3509 					(void *)atsr + atsr->header.length,
3510 					atsr->segment, atsru->devices,
3511 					atsru->devices_cnt);
3512 			if (ret > 0)
3513 				break;
3514 			else if (ret < 0)
3515 				return ret;
3516 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3517 			if (dmar_remove_dev_scope(info, atsr->segment,
3518 					atsru->devices, atsru->devices_cnt))
3519 				break;
3520 		}
3521 	}
3522 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3523 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3524 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3525 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3526 					(void *)satc + satc->header.length,
3527 					satc->segment, satcu->devices,
3528 					satcu->devices_cnt);
3529 			if (ret > 0)
3530 				break;
3531 			else if (ret < 0)
3532 				return ret;
3533 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3534 			if (dmar_remove_dev_scope(info, satc->segment,
3535 					satcu->devices, satcu->devices_cnt))
3536 				break;
3537 		}
3538 	}
3539 
3540 	return 0;
3541 }
3542 
3543 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3544 				       unsigned long val, void *v)
3545 {
3546 	struct memory_notify *mhp = v;
3547 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3548 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3549 			mhp->nr_pages - 1);
3550 
3551 	switch (val) {
3552 	case MEM_GOING_ONLINE:
3553 		if (iommu_domain_identity_map(si_domain,
3554 					      start_vpfn, last_vpfn)) {
3555 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3556 				start_vpfn, last_vpfn);
3557 			return NOTIFY_BAD;
3558 		}
3559 		break;
3560 
3561 	case MEM_OFFLINE:
3562 	case MEM_CANCEL_ONLINE:
3563 		{
3564 			struct dmar_drhd_unit *drhd;
3565 			struct intel_iommu *iommu;
3566 			LIST_HEAD(freelist);
3567 
3568 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3569 
3570 			rcu_read_lock();
3571 			for_each_active_iommu(iommu, drhd)
3572 				iommu_flush_iotlb_psi(iommu, si_domain,
3573 					start_vpfn, mhp->nr_pages,
3574 					list_empty(&freelist), 0);
3575 			rcu_read_unlock();
3576 			put_pages_list(&freelist);
3577 		}
3578 		break;
3579 	}
3580 
3581 	return NOTIFY_OK;
3582 }
3583 
3584 static struct notifier_block intel_iommu_memory_nb = {
3585 	.notifier_call = intel_iommu_memory_notifier,
3586 	.priority = 0
3587 };
3588 
3589 static void intel_disable_iommus(void)
3590 {
3591 	struct intel_iommu *iommu = NULL;
3592 	struct dmar_drhd_unit *drhd;
3593 
3594 	for_each_iommu(iommu, drhd)
3595 		iommu_disable_translation(iommu);
3596 }
3597 
3598 void intel_iommu_shutdown(void)
3599 {
3600 	struct dmar_drhd_unit *drhd;
3601 	struct intel_iommu *iommu = NULL;
3602 
3603 	if (no_iommu || dmar_disabled)
3604 		return;
3605 
3606 	down_write(&dmar_global_lock);
3607 
3608 	/* Disable PMRs explicitly here. */
3609 	for_each_iommu(iommu, drhd)
3610 		iommu_disable_protect_mem_regions(iommu);
3611 
3612 	/* Make sure the IOMMUs are switched off */
3613 	intel_disable_iommus();
3614 
3615 	up_write(&dmar_global_lock);
3616 }
3617 
3618 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3619 {
3620 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3621 
3622 	return container_of(iommu_dev, struct intel_iommu, iommu);
3623 }
3624 
3625 static ssize_t version_show(struct device *dev,
3626 			    struct device_attribute *attr, char *buf)
3627 {
3628 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3629 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3630 	return sysfs_emit(buf, "%d:%d\n",
3631 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3632 }
3633 static DEVICE_ATTR_RO(version);
3634 
3635 static ssize_t address_show(struct device *dev,
3636 			    struct device_attribute *attr, char *buf)
3637 {
3638 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3639 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3640 }
3641 static DEVICE_ATTR_RO(address);
3642 
3643 static ssize_t cap_show(struct device *dev,
3644 			struct device_attribute *attr, char *buf)
3645 {
3646 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3647 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3648 }
3649 static DEVICE_ATTR_RO(cap);
3650 
3651 static ssize_t ecap_show(struct device *dev,
3652 			 struct device_attribute *attr, char *buf)
3653 {
3654 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3655 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3656 }
3657 static DEVICE_ATTR_RO(ecap);
3658 
3659 static ssize_t domains_supported_show(struct device *dev,
3660 				      struct device_attribute *attr, char *buf)
3661 {
3662 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3663 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3664 }
3665 static DEVICE_ATTR_RO(domains_supported);
3666 
3667 static ssize_t domains_used_show(struct device *dev,
3668 				 struct device_attribute *attr, char *buf)
3669 {
3670 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3671 	return sysfs_emit(buf, "%d\n",
3672 			  bitmap_weight(iommu->domain_ids,
3673 					cap_ndoms(iommu->cap)));
3674 }
3675 static DEVICE_ATTR_RO(domains_used);
3676 
3677 static struct attribute *intel_iommu_attrs[] = {
3678 	&dev_attr_version.attr,
3679 	&dev_attr_address.attr,
3680 	&dev_attr_cap.attr,
3681 	&dev_attr_ecap.attr,
3682 	&dev_attr_domains_supported.attr,
3683 	&dev_attr_domains_used.attr,
3684 	NULL,
3685 };
3686 
3687 static struct attribute_group intel_iommu_group = {
3688 	.name = "intel-iommu",
3689 	.attrs = intel_iommu_attrs,
3690 };
3691 
3692 const struct attribute_group *intel_iommu_groups[] = {
3693 	&intel_iommu_group,
3694 	NULL,
3695 };
3696 
3697 static inline bool has_external_pci(void)
3698 {
3699 	struct pci_dev *pdev = NULL;
3700 
3701 	for_each_pci_dev(pdev)
3702 		if (pdev->external_facing) {
3703 			pci_dev_put(pdev);
3704 			return true;
3705 		}
3706 
3707 	return false;
3708 }
3709 
3710 static int __init platform_optin_force_iommu(void)
3711 {
3712 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3713 		return 0;
3714 
3715 	if (no_iommu || dmar_disabled)
3716 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3717 
3718 	/*
3719 	 * If Intel-IOMMU is disabled by default, we will apply identity
3720 	 * map for all devices except those marked as being untrusted.
3721 	 */
3722 	if (dmar_disabled)
3723 		iommu_set_default_passthrough(false);
3724 
3725 	dmar_disabled = 0;
3726 	no_iommu = 0;
3727 
3728 	return 1;
3729 }
3730 
3731 static int __init probe_acpi_namespace_devices(void)
3732 {
3733 	struct dmar_drhd_unit *drhd;
3734 	/* To avoid a -Wunused-but-set-variable warning. */
3735 	struct intel_iommu *iommu __maybe_unused;
3736 	struct device *dev;
3737 	int i, ret = 0;
3738 
3739 	for_each_active_iommu(iommu, drhd) {
3740 		for_each_active_dev_scope(drhd->devices,
3741 					  drhd->devices_cnt, i, dev) {
3742 			struct acpi_device_physical_node *pn;
3743 			struct acpi_device *adev;
3744 
3745 			if (dev->bus != &acpi_bus_type)
3746 				continue;
3747 
3748 			adev = to_acpi_device(dev);
3749 			mutex_lock(&adev->physical_node_lock);
3750 			list_for_each_entry(pn,
3751 					    &adev->physical_node_list, node) {
3752 				ret = iommu_probe_device(pn->dev);
3753 				if (ret)
3754 					break;
3755 			}
3756 			mutex_unlock(&adev->physical_node_lock);
3757 
3758 			if (ret)
3759 				return ret;
3760 		}
3761 	}
3762 
3763 	return 0;
3764 }
3765 
3766 static __init int tboot_force_iommu(void)
3767 {
3768 	if (!tboot_enabled())
3769 		return 0;
3770 
3771 	if (no_iommu || dmar_disabled)
3772 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3773 
3774 	dmar_disabled = 0;
3775 	no_iommu = 0;
3776 
3777 	return 1;
3778 }
3779 
3780 int __init intel_iommu_init(void)
3781 {
3782 	int ret = -ENODEV;
3783 	struct dmar_drhd_unit *drhd;
3784 	struct intel_iommu *iommu;
3785 
3786 	/*
3787 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3788 	 * opt in, so enforce that.
3789 	 */
3790 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3791 		    platform_optin_force_iommu();
3792 
3793 	down_write(&dmar_global_lock);
3794 	if (dmar_table_init()) {
3795 		if (force_on)
3796 			panic("tboot: Failed to initialize DMAR table\n");
3797 		goto out_free_dmar;
3798 	}
3799 
3800 	if (dmar_dev_scope_init() < 0) {
3801 		if (force_on)
3802 			panic("tboot: Failed to initialize DMAR device scope\n");
3803 		goto out_free_dmar;
3804 	}
3805 
3806 	up_write(&dmar_global_lock);
3807 
3808 	/*
3809 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3810 	 * complain later when we register it under the lock.
3811 	 */
3812 	dmar_register_bus_notifier();
3813 
3814 	down_write(&dmar_global_lock);
3815 
3816 	if (!no_iommu)
3817 		intel_iommu_debugfs_init();
3818 
3819 	if (no_iommu || dmar_disabled) {
3820 		/*
3821 		 * We exit the function here to ensure IOMMU's remapping and
3822 		 * mempool aren't setup, which means that the IOMMU's PMRs
3823 		 * won't be disabled via the call to init_dmars(). So disable
3824 		 * it explicitly here. The PMRs were setup by tboot prior to
3825 		 * calling SENTER, but the kernel is expected to reset/tear
3826 		 * down the PMRs.
3827 		 */
3828 		if (intel_iommu_tboot_noforce) {
3829 			for_each_iommu(iommu, drhd)
3830 				iommu_disable_protect_mem_regions(iommu);
3831 		}
3832 
3833 		/*
3834 		 * Make sure the IOMMUs are switched off, even when we
3835 		 * boot into a kexec kernel and the previous kernel left
3836 		 * them enabled
3837 		 */
3838 		intel_disable_iommus();
3839 		goto out_free_dmar;
3840 	}
3841 
3842 	if (list_empty(&dmar_rmrr_units))
3843 		pr_info("No RMRR found\n");
3844 
3845 	if (list_empty(&dmar_atsr_units))
3846 		pr_info("No ATSR found\n");
3847 
3848 	if (list_empty(&dmar_satc_units))
3849 		pr_info("No SATC found\n");
3850 
3851 	init_no_remapping_devices();
3852 
3853 	ret = init_dmars();
3854 	if (ret) {
3855 		if (force_on)
3856 			panic("tboot: Failed to initialize DMARs\n");
3857 		pr_err("Initialization failed\n");
3858 		goto out_free_dmar;
3859 	}
3860 	up_write(&dmar_global_lock);
3861 
3862 	init_iommu_pm_ops();
3863 
3864 	down_read(&dmar_global_lock);
3865 	for_each_active_iommu(iommu, drhd) {
3866 		/*
3867 		 * The flush queue implementation does not perform
3868 		 * page-selective invalidations that are required for efficient
3869 		 * TLB flushes in virtual environments.  The benefit of batching
3870 		 * is likely to be much lower than the overhead of synchronizing
3871 		 * the virtual and physical IOMMU page-tables.
3872 		 */
3873 		if (cap_caching_mode(iommu->cap) &&
3874 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3875 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3876 			iommu_set_dma_strict();
3877 		}
3878 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3879 				       intel_iommu_groups,
3880 				       "%s", iommu->name);
3881 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3882 
3883 		iommu_pmu_register(iommu);
3884 	}
3885 	up_read(&dmar_global_lock);
3886 
3887 	if (si_domain && !hw_pass_through)
3888 		register_memory_notifier(&intel_iommu_memory_nb);
3889 
3890 	down_read(&dmar_global_lock);
3891 	if (probe_acpi_namespace_devices())
3892 		pr_warn("ACPI name space devices didn't probe correctly\n");
3893 
3894 	/* Finally, we enable the DMA remapping hardware. */
3895 	for_each_iommu(iommu, drhd) {
3896 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3897 			iommu_enable_translation(iommu);
3898 
3899 		iommu_disable_protect_mem_regions(iommu);
3900 	}
3901 	up_read(&dmar_global_lock);
3902 
3903 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3904 
3905 	intel_iommu_enabled = 1;
3906 
3907 	return 0;
3908 
3909 out_free_dmar:
3910 	intel_iommu_free_dmars();
3911 	up_write(&dmar_global_lock);
3912 	return ret;
3913 }
3914 
3915 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3916 {
3917 	struct device_domain_info *info = opaque;
3918 
3919 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3920 	return 0;
3921 }
3922 
3923 /*
3924  * NB - intel-iommu lacks any sort of reference counting for the users of
3925  * dependent devices.  If multiple endpoints have intersecting dependent
3926  * devices, unbinding the driver from any one of them will possibly leave
3927  * the others unable to operate.
3928  */
3929 static void domain_context_clear(struct device_domain_info *info)
3930 {
3931 	if (!dev_is_pci(info->dev))
3932 		domain_context_clear_one(info, info->bus, info->devfn);
3933 
3934 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3935 			       &domain_context_clear_one_cb, info);
3936 }
3937 
3938 static void dmar_remove_one_dev_info(struct device *dev)
3939 {
3940 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3941 	struct dmar_domain *domain = info->domain;
3942 	struct intel_iommu *iommu = info->iommu;
3943 	unsigned long flags;
3944 
3945 	if (!dev_is_real_dma_subdevice(info->dev)) {
3946 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3947 			intel_pasid_tear_down_entry(iommu, info->dev,
3948 					IOMMU_NO_PASID, false);
3949 
3950 		iommu_disable_pci_caps(info);
3951 		domain_context_clear(info);
3952 	}
3953 
3954 	spin_lock_irqsave(&domain->lock, flags);
3955 	list_del(&info->link);
3956 	spin_unlock_irqrestore(&domain->lock, flags);
3957 
3958 	domain_detach_iommu(domain, iommu);
3959 	info->domain = NULL;
3960 }
3961 
3962 /*
3963  * Clear the page table pointer in context or pasid table entries so that
3964  * all DMA requests without PASID from the device are blocked. If the page
3965  * table has been set, clean up the data structures.
3966  */
3967 void device_block_translation(struct device *dev)
3968 {
3969 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3970 	struct intel_iommu *iommu = info->iommu;
3971 	unsigned long flags;
3972 
3973 	iommu_disable_pci_caps(info);
3974 	if (!dev_is_real_dma_subdevice(dev)) {
3975 		if (sm_supported(iommu))
3976 			intel_pasid_tear_down_entry(iommu, dev,
3977 						    IOMMU_NO_PASID, false);
3978 		else
3979 			domain_context_clear(info);
3980 	}
3981 
3982 	if (!info->domain)
3983 		return;
3984 
3985 	spin_lock_irqsave(&info->domain->lock, flags);
3986 	list_del(&info->link);
3987 	spin_unlock_irqrestore(&info->domain->lock, flags);
3988 
3989 	domain_detach_iommu(info->domain, iommu);
3990 	info->domain = NULL;
3991 }
3992 
3993 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3994 {
3995 	int adjust_width;
3996 
3997 	/* calculate AGAW */
3998 	domain->gaw = guest_width;
3999 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4000 	domain->agaw = width_to_agaw(adjust_width);
4001 
4002 	domain->iommu_coherency = false;
4003 	domain->iommu_superpage = 0;
4004 	domain->max_addr = 0;
4005 
4006 	/* always allocate the top pgd */
4007 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4008 	if (!domain->pgd)
4009 		return -ENOMEM;
4010 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4011 	return 0;
4012 }
4013 
/*
 * attach_dev op of the blocking domain: block translation for @dev.
 * @domain is not used. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
4020 
4021 static struct iommu_domain blocking_domain = {
4022 	.type = IOMMU_DOMAIN_BLOCKED,
4023 	.ops = &(const struct iommu_domain_ops) {
4024 		.attach_dev	= blocking_domain_attach_dev,
4025 	}
4026 };
4027 
4028 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4029 {
4030 	struct dmar_domain *dmar_domain;
4031 	struct iommu_domain *domain;
4032 
4033 	switch (type) {
4034 	case IOMMU_DOMAIN_DMA:
4035 	case IOMMU_DOMAIN_UNMANAGED:
4036 		dmar_domain = alloc_domain(type);
4037 		if (!dmar_domain) {
4038 			pr_err("Can't allocate dmar_domain\n");
4039 			return NULL;
4040 		}
4041 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4042 			pr_err("Domain initialization failed\n");
4043 			domain_exit(dmar_domain);
4044 			return NULL;
4045 		}
4046 
4047 		domain = &dmar_domain->domain;
4048 		domain->geometry.aperture_start = 0;
4049 		domain->geometry.aperture_end   =
4050 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4051 		domain->geometry.force_aperture = true;
4052 
4053 		return domain;
4054 	case IOMMU_DOMAIN_IDENTITY:
4055 		return &si_domain->domain;
4056 	case IOMMU_DOMAIN_SVA:
4057 		return intel_svm_domain_alloc();
4058 	default:
4059 		return NULL;
4060 	}
4061 
4062 	return NULL;
4063 }
4064 
4065 static struct iommu_domain *
4066 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
4067 			      struct iommu_domain *parent,
4068 			      const struct iommu_user_data *user_data)
4069 {
4070 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4071 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
4072 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
4073 	struct intel_iommu *iommu = info->iommu;
4074 	struct iommu_domain *domain;
4075 
4076 	/* Must be NESTING domain */
4077 	if (parent) {
4078 		if (!nested_supported(iommu) || flags)
4079 			return ERR_PTR(-EOPNOTSUPP);
4080 		return intel_nested_domain_alloc(parent, user_data);
4081 	}
4082 
4083 	if (flags &
4084 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
4085 		return ERR_PTR(-EOPNOTSUPP);
4086 	if (nested_parent && !nested_supported(iommu))
4087 		return ERR_PTR(-EOPNOTSUPP);
4088 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
4089 		return ERR_PTR(-EOPNOTSUPP);
4090 
4091 	/*
4092 	 * domain_alloc_user op needs to fully initialize a domain before
4093 	 * return, so uses iommu_domain_alloc() here for simple.
4094 	 */
4095 	domain = iommu_domain_alloc(dev->bus);
4096 	if (!domain)
4097 		return ERR_PTR(-ENOMEM);
4098 
4099 	if (nested_parent)
4100 		to_dmar_domain(domain)->nested_parent = true;
4101 
4102 	if (dirty_tracking) {
4103 		if (to_dmar_domain(domain)->use_first_level) {
4104 			iommu_domain_free(domain);
4105 			return ERR_PTR(-EOPNOTSUPP);
4106 		}
4107 		domain->dirty_ops = &intel_dirty_ops;
4108 	}
4109 
4110 	return domain;
4111 }
4112 
4113 static void intel_iommu_domain_free(struct iommu_domain *domain)
4114 {
4115 	if (domain != &si_domain->domain)
4116 		domain_exit(to_dmar_domain(domain));
4117 }
4118 
4119 int prepare_domain_attach_device(struct iommu_domain *domain,
4120 				 struct device *dev)
4121 {
4122 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4123 	struct intel_iommu *iommu;
4124 	int addr_width;
4125 
4126 	iommu = device_to_iommu(dev, NULL, NULL);
4127 	if (!iommu)
4128 		return -ENODEV;
4129 
4130 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4131 		return -EINVAL;
4132 
4133 	if (domain->dirty_ops && !ssads_supported(iommu))
4134 		return -EINVAL;
4135 
4136 	/* check if this iommu agaw is sufficient for max mapped address */
4137 	addr_width = agaw_to_width(iommu->agaw);
4138 	if (addr_width > cap_mgaw(iommu->cap))
4139 		addr_width = cap_mgaw(iommu->cap);
4140 
4141 	if (dmar_domain->max_addr > (1LL << addr_width))
4142 		return -EINVAL;
4143 	dmar_domain->gaw = addr_width;
4144 
4145 	/*
4146 	 * Knock out extra levels of page tables if necessary
4147 	 */
4148 	while (iommu->agaw < dmar_domain->agaw) {
4149 		struct dma_pte *pte;
4150 
4151 		pte = dmar_domain->pgd;
4152 		if (dma_pte_present(pte)) {
4153 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4154 			free_pgtable_page(pte);
4155 		}
4156 		dmar_domain->agaw--;
4157 	}
4158 
4159 	return 0;
4160 }
4161 
4162 static int intel_iommu_attach_device(struct iommu_domain *domain,
4163 				     struct device *dev)
4164 {
4165 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4166 	int ret;
4167 
4168 	if (info->domain)
4169 		device_block_translation(dev);
4170 
4171 	ret = prepare_domain_attach_device(domain, dev);
4172 	if (ret)
4173 		return ret;
4174 
4175 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4176 }
4177 
4178 static int intel_iommu_map(struct iommu_domain *domain,
4179 			   unsigned long iova, phys_addr_t hpa,
4180 			   size_t size, int iommu_prot, gfp_t gfp)
4181 {
4182 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4183 	u64 max_addr;
4184 	int prot = 0;
4185 
4186 	if (iommu_prot & IOMMU_READ)
4187 		prot |= DMA_PTE_READ;
4188 	if (iommu_prot & IOMMU_WRITE)
4189 		prot |= DMA_PTE_WRITE;
4190 	if (dmar_domain->set_pte_snp)
4191 		prot |= DMA_PTE_SNP;
4192 
4193 	max_addr = iova + size;
4194 	if (dmar_domain->max_addr < max_addr) {
4195 		u64 end;
4196 
4197 		/* check if minimum agaw is sufficient for mapped address */
4198 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4199 		if (end < max_addr) {
4200 			pr_err("%s: iommu width (%d) is not "
4201 			       "sufficient for the mapped address (%llx)\n",
4202 			       __func__, dmar_domain->gaw, max_addr);
4203 			return -EFAULT;
4204 		}
4205 		dmar_domain->max_addr = max_addr;
4206 	}
4207 	/* Round up size to next multiple of PAGE_SIZE, if it and
4208 	   the low bits of hpa would take us onto the next page */
4209 	size = aligned_nrpages(hpa, size);
4210 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4211 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4212 }
4213 
4214 static int intel_iommu_map_pages(struct iommu_domain *domain,
4215 				 unsigned long iova, phys_addr_t paddr,
4216 				 size_t pgsize, size_t pgcount,
4217 				 int prot, gfp_t gfp, size_t *mapped)
4218 {
4219 	unsigned long pgshift = __ffs(pgsize);
4220 	size_t size = pgcount << pgshift;
4221 	int ret;
4222 
4223 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4224 		return -EINVAL;
4225 
4226 	if (!IS_ALIGNED(iova | paddr, pgsize))
4227 		return -EINVAL;
4228 
4229 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4230 	if (!ret && mapped)
4231 		*mapped = size;
4232 
4233 	return ret;
4234 }
4235 
4236 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4237 				unsigned long iova, size_t size,
4238 				struct iommu_iotlb_gather *gather)
4239 {
4240 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4241 	unsigned long start_pfn, last_pfn;
4242 	int level = 0;
4243 
4244 	/* Cope with horrid API which requires us to unmap more than the
4245 	   size argument if it happens to be a large-page mapping. */
4246 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4247 				     &level, GFP_ATOMIC)))
4248 		return 0;
4249 
4250 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4251 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4252 
4253 	start_pfn = iova >> VTD_PAGE_SHIFT;
4254 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4255 
4256 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4257 
4258 	if (dmar_domain->max_addr == iova + size)
4259 		dmar_domain->max_addr = iova;
4260 
4261 	/*
4262 	 * We do not use page-selective IOTLB invalidation in flush queue,
4263 	 * so there is no need to track page and sync iotlb.
4264 	 */
4265 	if (!iommu_iotlb_gather_queued(gather))
4266 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4267 
4268 	return size;
4269 }
4270 
4271 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4272 				      unsigned long iova,
4273 				      size_t pgsize, size_t pgcount,
4274 				      struct iommu_iotlb_gather *gather)
4275 {
4276 	unsigned long pgshift = __ffs(pgsize);
4277 	size_t size = pgcount << pgshift;
4278 
4279 	return intel_iommu_unmap(domain, iova, size, gather);
4280 }
4281 
4282 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4283 				 struct iommu_iotlb_gather *gather)
4284 {
4285 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4286 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4287 	size_t size = gather->end - gather->start;
4288 	struct iommu_domain_info *info;
4289 	unsigned long start_pfn;
4290 	unsigned long nrpages;
4291 	unsigned long i;
4292 
4293 	nrpages = aligned_nrpages(gather->start, size);
4294 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4295 
4296 	xa_for_each(&dmar_domain->iommu_array, i, info)
4297 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4298 				      start_pfn, nrpages,
4299 				      list_empty(&gather->freelist), 0);
4300 
4301 	put_pages_list(&gather->freelist);
4302 }
4303 
4304 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4305 					    dma_addr_t iova)
4306 {
4307 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4308 	struct dma_pte *pte;
4309 	int level = 0;
4310 	u64 phys = 0;
4311 
4312 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4313 			     GFP_ATOMIC);
4314 	if (pte && dma_pte_present(pte))
4315 		phys = dma_pte_addr(pte) +
4316 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4317 						VTD_PAGE_SHIFT) - 1));
4318 
4319 	return phys;
4320 }
4321 
4322 static bool domain_support_force_snooping(struct dmar_domain *domain)
4323 {
4324 	struct device_domain_info *info;
4325 	bool support = true;
4326 
4327 	assert_spin_locked(&domain->lock);
4328 	list_for_each_entry(info, &domain->devices, link) {
4329 		if (!ecap_sc_support(info->iommu->ecap)) {
4330 			support = false;
4331 			break;
4332 		}
4333 	}
4334 
4335 	return support;
4336 }
4337 
4338 static void domain_set_force_snooping(struct dmar_domain *domain)
4339 {
4340 	struct device_domain_info *info;
4341 
4342 	assert_spin_locked(&domain->lock);
4343 	/*
4344 	 * Second level page table supports per-PTE snoop control. The
4345 	 * iommu_map() interface will handle this by setting SNP bit.
4346 	 */
4347 	if (!domain->use_first_level) {
4348 		domain->set_pte_snp = true;
4349 		return;
4350 	}
4351 
4352 	list_for_each_entry(info, &domain->devices, link)
4353 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4354 						     IOMMU_NO_PASID);
4355 }
4356 
4357 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4358 {
4359 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4360 	unsigned long flags;
4361 
4362 	if (dmar_domain->force_snooping)
4363 		return true;
4364 
4365 	spin_lock_irqsave(&dmar_domain->lock, flags);
4366 	if (!domain_support_force_snooping(dmar_domain) ||
4367 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4368 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4369 		return false;
4370 	}
4371 
4372 	domain_set_force_snooping(dmar_domain);
4373 	dmar_domain->force_snooping = true;
4374 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4375 
4376 	return true;
4377 }
4378 
4379 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4380 {
4381 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4382 
4383 	switch (cap) {
4384 	case IOMMU_CAP_CACHE_COHERENCY:
4385 	case IOMMU_CAP_DEFERRED_FLUSH:
4386 		return true;
4387 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4388 		return dmar_platform_optin();
4389 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4390 		return ecap_sc_support(info->iommu->ecap);
4391 	case IOMMU_CAP_DIRTY_TRACKING:
4392 		return ssads_supported(info->iommu);
4393 	default:
4394 		return false;
4395 	}
4396 }
4397 
4398 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4399 {
4400 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4401 	struct device_domain_info *info;
4402 	struct intel_iommu *iommu;
4403 	u8 bus, devfn;
4404 	int ret;
4405 
4406 	iommu = device_to_iommu(dev, &bus, &devfn);
4407 	if (!iommu || !iommu->iommu.ops)
4408 		return ERR_PTR(-ENODEV);
4409 
4410 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4411 	if (!info)
4412 		return ERR_PTR(-ENOMEM);
4413 
4414 	if (dev_is_real_dma_subdevice(dev)) {
4415 		info->bus = pdev->bus->number;
4416 		info->devfn = pdev->devfn;
4417 		info->segment = pci_domain_nr(pdev->bus);
4418 	} else {
4419 		info->bus = bus;
4420 		info->devfn = devfn;
4421 		info->segment = iommu->segment;
4422 	}
4423 
4424 	info->dev = dev;
4425 	info->iommu = iommu;
4426 	if (dev_is_pci(dev)) {
4427 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4428 		    pci_ats_supported(pdev) &&
4429 		    dmar_ats_supported(pdev, iommu)) {
4430 			info->ats_supported = 1;
4431 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4432 
4433 			/*
4434 			 * For IOMMU that supports device IOTLB throttling
4435 			 * (DIT), we assign PFSID to the invalidation desc
4436 			 * of a VF such that IOMMU HW can gauge queue depth
4437 			 * at PF level. If DIT is not set, PFSID will be
4438 			 * treated as reserved, which should be set to 0.
4439 			 */
4440 			if (ecap_dit(iommu->ecap))
4441 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4442 			info->ats_qdep = pci_ats_queue_depth(pdev);
4443 		}
4444 		if (sm_supported(iommu)) {
4445 			if (pasid_supported(iommu)) {
4446 				int features = pci_pasid_features(pdev);
4447 
4448 				if (features >= 0)
4449 					info->pasid_supported = features | 1;
4450 			}
4451 
4452 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4453 			    pci_pri_supported(pdev))
4454 				info->pri_supported = 1;
4455 		}
4456 	}
4457 
4458 	dev_iommu_priv_set(dev, info);
4459 
4460 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4461 		ret = intel_pasid_alloc_table(dev);
4462 		if (ret) {
4463 			dev_err(dev, "PASID table allocation failed\n");
4464 			dev_iommu_priv_set(dev, NULL);
4465 			kfree(info);
4466 			return ERR_PTR(ret);
4467 		}
4468 	}
4469 
4470 	intel_iommu_debugfs_create_dev(info);
4471 
4472 	return &iommu->iommu;
4473 }
4474 
4475 static void intel_iommu_release_device(struct device *dev)
4476 {
4477 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4478 
4479 	dmar_remove_one_dev_info(dev);
4480 	intel_pasid_free_table(dev);
4481 	intel_iommu_debugfs_remove_dev(info);
4482 	dev_iommu_priv_set(dev, NULL);
4483 	kfree(info);
4484 	set_dma_ops(dev, NULL);
4485 }
4486 
4487 static void intel_iommu_probe_finalize(struct device *dev)
4488 {
4489 	set_dma_ops(dev, NULL);
4490 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4491 }
4492 
4493 static void intel_iommu_get_resv_regions(struct device *device,
4494 					 struct list_head *head)
4495 {
4496 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4497 	struct iommu_resv_region *reg;
4498 	struct dmar_rmrr_unit *rmrr;
4499 	struct device *i_dev;
4500 	int i;
4501 
4502 	rcu_read_lock();
4503 	for_each_rmrr_units(rmrr) {
4504 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4505 					  i, i_dev) {
4506 			struct iommu_resv_region *resv;
4507 			enum iommu_resv_type type;
4508 			size_t length;
4509 
4510 			if (i_dev != device &&
4511 			    !is_downstream_to_pci_bridge(device, i_dev))
4512 				continue;
4513 
4514 			length = rmrr->end_address - rmrr->base_address + 1;
4515 
4516 			type = device_rmrr_is_relaxable(device) ?
4517 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4518 
4519 			resv = iommu_alloc_resv_region(rmrr->base_address,
4520 						       length, prot, type,
4521 						       GFP_ATOMIC);
4522 			if (!resv)
4523 				break;
4524 
4525 			list_add_tail(&resv->list, head);
4526 		}
4527 	}
4528 	rcu_read_unlock();
4529 
4530 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4531 	if (dev_is_pci(device)) {
4532 		struct pci_dev *pdev = to_pci_dev(device);
4533 
4534 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4535 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4536 					IOMMU_RESV_DIRECT_RELAXABLE,
4537 					GFP_KERNEL);
4538 			if (reg)
4539 				list_add_tail(&reg->list, head);
4540 		}
4541 	}
4542 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4543 
4544 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4545 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4546 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4547 	if (!reg)
4548 		return;
4549 	list_add_tail(&reg->list, head);
4550 }
4551 
/*
 * Pick the IOMMU group for @dev: PCI devices use the PCI grouping
 * helper, everything else the generic one.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4558 
4559 static int intel_iommu_enable_sva(struct device *dev)
4560 {
4561 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4562 	struct intel_iommu *iommu;
4563 
4564 	if (!info || dmar_disabled)
4565 		return -EINVAL;
4566 
4567 	iommu = info->iommu;
4568 	if (!iommu)
4569 		return -EINVAL;
4570 
4571 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4572 		return -ENODEV;
4573 
4574 	if (!info->pasid_enabled || !info->ats_enabled)
4575 		return -EINVAL;
4576 
4577 	/*
4578 	 * Devices having device-specific I/O fault handling should not
4579 	 * support PCI/PRI. The IOMMU side has no means to check the
4580 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4581 	 * default that if the device driver enables SVA on a non-PRI
4582 	 * device, it will handle IOPF in its own way.
4583 	 */
4584 	if (!info->pri_supported)
4585 		return 0;
4586 
4587 	/* Devices supporting PRI should have it enabled. */
4588 	if (!info->pri_enabled)
4589 		return -EINVAL;
4590 
4591 	return 0;
4592 }
4593 
4594 static int intel_iommu_enable_iopf(struct device *dev)
4595 {
4596 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4597 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4598 	struct intel_iommu *iommu;
4599 	int ret;
4600 
4601 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4602 		return -ENODEV;
4603 
4604 	if (info->pri_enabled)
4605 		return -EBUSY;
4606 
4607 	iommu = info->iommu;
4608 	if (!iommu)
4609 		return -EINVAL;
4610 
4611 	/* PASID is required in PRG Response Message. */
4612 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4613 		return -EINVAL;
4614 
4615 	ret = pci_reset_pri(pdev);
4616 	if (ret)
4617 		return ret;
4618 
4619 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4620 	if (ret)
4621 		return ret;
4622 
4623 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4624 	if (ret)
4625 		goto iopf_remove_device;
4626 
4627 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4628 	if (ret)
4629 		goto iopf_unregister_handler;
4630 	info->pri_enabled = 1;
4631 
4632 	return 0;
4633 
4634 iopf_unregister_handler:
4635 	iommu_unregister_device_fault_handler(dev);
4636 iopf_remove_device:
4637 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4638 
4639 	return ret;
4640 }
4641 
4642 static int intel_iommu_disable_iopf(struct device *dev)
4643 {
4644 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4645 	struct intel_iommu *iommu = info->iommu;
4646 
4647 	if (!info->pri_enabled)
4648 		return -EINVAL;
4649 
4650 	/*
4651 	 * PCIe spec states that by clearing PRI enable bit, the Page
4652 	 * Request Interface will not issue new page requests, but has
4653 	 * outstanding page requests that have been transmitted or are
4654 	 * queued for transmission. This is supposed to be called after
4655 	 * the device driver has stopped DMA, all PASIDs have been
4656 	 * unbound and the outstanding PRQs have been drained.
4657 	 */
4658 	pci_disable_pri(to_pci_dev(dev));
4659 	info->pri_enabled = 0;
4660 
4661 	/*
4662 	 * With PRI disabled and outstanding PRQs drained, unregistering
4663 	 * fault handler and removing device from iopf queue should never
4664 	 * fail.
4665 	 */
4666 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4667 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4668 
4669 	return 0;
4670 }
4671 
4672 static int
4673 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4674 {
4675 	switch (feat) {
4676 	case IOMMU_DEV_FEAT_IOPF:
4677 		return intel_iommu_enable_iopf(dev);
4678 
4679 	case IOMMU_DEV_FEAT_SVA:
4680 		return intel_iommu_enable_sva(dev);
4681 
4682 	default:
4683 		return -ENODEV;
4684 	}
4685 }
4686 
4687 static int
4688 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4689 {
4690 	switch (feat) {
4691 	case IOMMU_DEV_FEAT_IOPF:
4692 		return intel_iommu_disable_iopf(dev);
4693 
4694 	case IOMMU_DEV_FEAT_SVA:
4695 		return 0;
4696 
4697 	default:
4698 		return -ENODEV;
4699 	}
4700 }
4701 
4702 static bool intel_iommu_is_attach_deferred(struct device *dev)
4703 {
4704 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4705 
4706 	return translation_pre_enabled(info->iommu) && !info->domain;
4707 }
4708 
4709 /*
4710  * Check that the device does not live on an external facing PCI port that is
4711  * marked as untrusted. Such devices should not be able to apply quirks and
4712  * thus not be able to bypass the IOMMU restrictions.
4713  */
4714 static bool risky_device(struct pci_dev *pdev)
4715 {
4716 	if (pdev->untrusted) {
4717 		pci_info(pdev,
4718 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4719 			 pdev->vendor, pdev->device);
4720 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4721 		return true;
4722 	}
4723 	return false;
4724 }
4725 
4726 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4727 				      unsigned long iova, size_t size)
4728 {
4729 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4730 	unsigned long pages = aligned_nrpages(iova, size);
4731 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4732 	struct iommu_domain_info *info;
4733 	unsigned long i;
4734 
4735 	xa_for_each(&dmar_domain->iommu_array, i, info)
4736 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4737 	return 0;
4738 }
4739 
4740 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4741 {
4742 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4743 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4744 	struct dmar_domain *dmar_domain;
4745 	struct iommu_domain *domain;
4746 	unsigned long flags;
4747 
4748 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4749 	if (WARN_ON_ONCE(!domain))
4750 		goto out_tear_down;
4751 
4752 	/*
4753 	 * The SVA implementation needs to handle its own stuffs like the mm
4754 	 * notification. Before consolidating that code into iommu core, let
4755 	 * the intel sva code handle it.
4756 	 */
4757 	if (domain->type == IOMMU_DOMAIN_SVA) {
4758 		intel_svm_remove_dev_pasid(dev, pasid);
4759 		goto out_tear_down;
4760 	}
4761 
4762 	dmar_domain = to_dmar_domain(domain);
4763 	spin_lock_irqsave(&dmar_domain->lock, flags);
4764 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4765 		if (curr->dev == dev && curr->pasid == pasid) {
4766 			list_del(&curr->link_domain);
4767 			dev_pasid = curr;
4768 			break;
4769 		}
4770 	}
4771 	WARN_ON_ONCE(!dev_pasid);
4772 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4773 
4774 	domain_detach_iommu(dmar_domain, iommu);
4775 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4776 	kfree(dev_pasid);
4777 out_tear_down:
4778 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4779 	intel_drain_pasid_prq(dev, pasid);
4780 }
4781 
4782 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4783 				     struct device *dev, ioasid_t pasid)
4784 {
4785 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4786 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4787 	struct intel_iommu *iommu = info->iommu;
4788 	struct dev_pasid_info *dev_pasid;
4789 	unsigned long flags;
4790 	int ret;
4791 
4792 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4793 		return -EOPNOTSUPP;
4794 
4795 	if (domain->dirty_ops)
4796 		return -EINVAL;
4797 
4798 	if (context_copied(iommu, info->bus, info->devfn))
4799 		return -EBUSY;
4800 
4801 	ret = prepare_domain_attach_device(domain, dev);
4802 	if (ret)
4803 		return ret;
4804 
4805 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4806 	if (!dev_pasid)
4807 		return -ENOMEM;
4808 
4809 	ret = domain_attach_iommu(dmar_domain, iommu);
4810 	if (ret)
4811 		goto out_free;
4812 
4813 	if (domain_type_is_si(dmar_domain))
4814 		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4815 						     dev, pasid);
4816 	else if (dmar_domain->use_first_level)
4817 		ret = domain_setup_first_level(iommu, dmar_domain,
4818 					       dev, pasid);
4819 	else
4820 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4821 						     dev, pasid);
4822 	if (ret)
4823 		goto out_detach_iommu;
4824 
4825 	dev_pasid->dev = dev;
4826 	dev_pasid->pasid = pasid;
4827 	spin_lock_irqsave(&dmar_domain->lock, flags);
4828 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4829 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4830 
4831 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4832 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4833 
4834 	return 0;
4835 out_detach_iommu:
4836 	domain_detach_iommu(dmar_domain, iommu);
4837 out_free:
4838 	kfree(dev_pasid);
4839 	return ret;
4840 }
4841 
4842 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4843 {
4844 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4845 	struct intel_iommu *iommu = info->iommu;
4846 	struct iommu_hw_info_vtd *vtd;
4847 
4848 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4849 	if (!vtd)
4850 		return ERR_PTR(-ENOMEM);
4851 
4852 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4853 	vtd->cap_reg = iommu->cap;
4854 	vtd->ecap_reg = iommu->ecap;
4855 	*length = sizeof(*vtd);
4856 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4857 	return vtd;
4858 }
4859 
4860 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4861 					  bool enable)
4862 {
4863 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4864 	struct device_domain_info *info;
4865 	int ret;
4866 
4867 	spin_lock(&dmar_domain->lock);
4868 	if (dmar_domain->dirty_tracking == enable)
4869 		goto out_unlock;
4870 
4871 	list_for_each_entry(info, &dmar_domain->devices, link) {
4872 		ret = intel_pasid_setup_dirty_tracking(info->iommu,
4873 						       info->domain, info->dev,
4874 						       IOMMU_NO_PASID, enable);
4875 		if (ret)
4876 			goto err_unwind;
4877 	}
4878 
4879 	dmar_domain->dirty_tracking = enable;
4880 out_unlock:
4881 	spin_unlock(&dmar_domain->lock);
4882 
4883 	return 0;
4884 
4885 err_unwind:
4886 	list_for_each_entry(info, &dmar_domain->devices, link)
4887 		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4888 						 info->dev, IOMMU_NO_PASID,
4889 						 dmar_domain->dirty_tracking);
4890 	spin_unlock(&dmar_domain->lock);
4891 	return ret;
4892 }
4893 
4894 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4895 					    unsigned long iova, size_t size,
4896 					    unsigned long flags,
4897 					    struct iommu_dirty_bitmap *dirty)
4898 {
4899 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4900 	unsigned long end = iova + size - 1;
4901 	unsigned long pgsize;
4902 
4903 	/*
4904 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4905 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4906 	 * have occurred when we stopped dirty tracking. This ensures that we
4907 	 * never inherit dirtied bits from a previous cycle.
4908 	 */
4909 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4910 		return -EINVAL;
4911 
4912 	do {
4913 		struct dma_pte *pte;
4914 		int lvl = 0;
4915 
4916 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4917 				     GFP_ATOMIC);
4918 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4919 		if (!pte || !dma_pte_present(pte)) {
4920 			iova += pgsize;
4921 			continue;
4922 		}
4923 
4924 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4925 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4926 		iova += pgsize;
4927 	} while (iova < end);
4928 
4929 	return 0;
4930 }
4931 
4932 static const struct iommu_dirty_ops intel_dirty_ops = {
4933 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4934 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4935 };
4936 
4937 const struct iommu_ops intel_iommu_ops = {
4938 	.blocked_domain		= &blocking_domain,
4939 	.capable		= intel_iommu_capable,
4940 	.hw_info		= intel_iommu_hw_info,
4941 	.domain_alloc		= intel_iommu_domain_alloc,
4942 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4943 	.probe_device		= intel_iommu_probe_device,
4944 	.probe_finalize		= intel_iommu_probe_finalize,
4945 	.release_device		= intel_iommu_release_device,
4946 	.get_resv_regions	= intel_iommu_get_resv_regions,
4947 	.device_group		= intel_iommu_device_group,
4948 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4949 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4950 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4951 	.def_domain_type	= device_def_domain_type,
4952 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4953 	.pgsize_bitmap		= SZ_4K,
4954 #ifdef CONFIG_INTEL_IOMMU_SVM
4955 	.page_response		= intel_svm_page_response,
4956 #endif
4957 	.default_domain_ops = &(const struct iommu_domain_ops) {
4958 		.attach_dev		= intel_iommu_attach_device,
4959 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4960 		.map_pages		= intel_iommu_map_pages,
4961 		.unmap_pages		= intel_iommu_unmap_pages,
4962 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4963 		.flush_iotlb_all        = intel_flush_iotlb_all,
4964 		.iotlb_sync		= intel_iommu_tlb_sync,
4965 		.iova_to_phys		= intel_iommu_iova_to_phys,
4966 		.free			= intel_iommu_domain_free,
4967 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4968 	}
4969 };
4970 
4971 static void quirk_iommu_igfx(struct pci_dev *dev)
4972 {
4973 	if (risky_device(dev))
4974 		return;
4975 
4976 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4977 	dmar_map_gfx = 0;
4978 }
4979 
4980 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4988 
4989 /* Broadwell igfx malfunctions with dmar */
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5005 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5006 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5007 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5014 
5015 static void quirk_iommu_rwbf(struct pci_dev *dev)
5016 {
5017 	if (risky_device(dev))
5018 		return;
5019 
5020 	/*
5021 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5022 	 * but needs it. Same seems to hold for the desktop versions.
5023 	 */
5024 	pci_info(dev, "Forcing write-buffer flush capability\n");
5025 	rwbf_quirk = 1;
5026 }
5027 
5028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5035 
5036 #define GGC 0x52
5037 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5038 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5039 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5040 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5041 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5042 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5043 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5044 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5045 
5046 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5047 {
5048 	unsigned short ggc;
5049 
5050 	if (risky_device(dev))
5051 		return;
5052 
5053 	if (pci_read_config_word(dev, GGC, &ggc))
5054 		return;
5055 
5056 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5057 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5058 		dmar_map_gfx = 0;
5059 	} else if (dmar_map_gfx) {
5060 		/* we have to ensure the gfx device is idle before we flush */
5061 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5062 		iommu_set_dma_strict();
5063 	}
5064 }
5065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5069 
5070 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5071 {
5072 	unsigned short ver;
5073 
5074 	if (!IS_GFX_DEVICE(dev))
5075 		return;
5076 
5077 	ver = (dev->device >> 8) & 0xff;
5078 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5079 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5080 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5081 		return;
5082 
5083 	if (risky_device(dev))
5084 		return;
5085 
5086 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5087 	iommu_skip_te_disable = 1;
5088 }
5089 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5090 
5091 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5092    ISOCH DMAR unit for the Azalia sound device, but not give it any
5093    TLB entries, which causes it to deadlock. Check for that.  We do
5094    this in a function called from init_dmars(), instead of in a PCI
5095    quirk, because we don't want to print the obnoxious "BIOS broken"
5096    message if VT-d is actually disabled.
5097 */
5098 static void __init check_tylersburg_isoch(void)
5099 {
5100 	struct pci_dev *pdev;
5101 	uint32_t vtisochctrl;
5102 
5103 	/* If there's no Azalia in the system anyway, forget it. */
5104 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5105 	if (!pdev)
5106 		return;
5107 
5108 	if (risky_device(pdev)) {
5109 		pci_dev_put(pdev);
5110 		return;
5111 	}
5112 
5113 	pci_dev_put(pdev);
5114 
5115 	/* System Management Registers. Might be hidden, in which case
5116 	   we can't do the sanity check. But that's OK, because the
5117 	   known-broken BIOSes _don't_ actually hide it, so far. */
5118 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5119 	if (!pdev)
5120 		return;
5121 
5122 	if (risky_device(pdev)) {
5123 		pci_dev_put(pdev);
5124 		return;
5125 	}
5126 
5127 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5128 		pci_dev_put(pdev);
5129 		return;
5130 	}
5131 
5132 	pci_dev_put(pdev);
5133 
5134 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5135 	if (vtisochctrl & 1)
5136 		return;
5137 
5138 	/* Drop all bits other than the number of TLB entries */
5139 	vtisochctrl &= 0x1c;
5140 
5141 	/* If we have the recommended number of TLB entries (16), fine. */
5142 	if (vtisochctrl == 0x10)
5143 		return;
5144 
5145 	/* Zero TLB entries? You get to ride the short bus to school. */
5146 	if (!vtisochctrl) {
5147 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5148 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5149 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5150 		     dmi_get_system_info(DMI_BIOS_VERSION),
5151 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5152 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5153 		return;
5154 	}
5155 
5156 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5157 	       vtisochctrl);
5158 }
5159 
/*
 * Here we deal with a device TLB defect where a device may inadvertently issue
 * an ATS invalidation completion before posted writes that were initiated with
 * a translated address and that utilized translations matching the
 * invalidation address range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
 * under the control of the trusted/privileged host device driver must use this
 * quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
 *    exit_mmap() due to crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use this
 * quirk. The dTLB invalidation after PASID cache flush does not need this
 * quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 */
5186 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5187 			       unsigned long address, unsigned long mask,
5188 			       u32 pasid, u16 qdep)
5189 {
5190 	u16 sid;
5191 
5192 	if (likely(!info->dtlb_extra_inval))
5193 		return;
5194 
5195 	sid = PCI_DEVID(info->bus, info->devfn);
5196 	if (pasid == IOMMU_NO_PASID) {
5197 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5198 				   qdep, address, mask);
5199 	} else {
5200 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5201 					 pasid, qdep, address, mask);
5202 	}
5203 }
5204 
/* Extract the status code, bits 7:1, from an enhanced command response. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
5206 
5207 /*
5208  * Function to submit a command to the enhanced command interface. The
5209  * valid enhanced command descriptions are defined in Table 47 of the
5210  * VT-d spec. The VT-d hardware implementation may support some but not
5211  * all commands, which can be determined by checking the Enhanced
5212  * Command Capability Register.
5213  *
5214  * Return values:
5215  *  - 0: Command successful without any error;
5216  *  - Negative: software error value;
5217  *  - Nonzero positive: failure status code defined in Table 48.
5218  */
5219 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5220 {
5221 	unsigned long flags;
5222 	u64 res;
5223 	int ret;
5224 
5225 	if (!cap_ecmds(iommu->cap))
5226 		return -ENODEV;
5227 
5228 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5229 
5230 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5231 	if (res & DMA_ECMD_ECRSP_IP) {
5232 		ret = -EBUSY;
5233 		goto err;
5234 	}
5235 
5236 	/*
5237 	 * Unconditionally write the operand B, because
5238 	 * - There is no side effect if an ecmd doesn't require an
5239 	 *   operand B, but we set the register to some value.
5240 	 * - It's not invoked in any critical path. The extra MMIO
5241 	 *   write doesn't bring any performance concerns.
5242 	 */
5243 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5244 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5245 
5246 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5247 		      !(res & DMA_ECMD_ECRSP_IP), res);
5248 
5249 	if (res & DMA_ECMD_ECRSP_IP) {
5250 		ret = -ETIMEDOUT;
5251 		goto err;
5252 	}
5253 
5254 	ret = ecmd_get_status_code(res);
5255 err:
5256 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5257 
5258 	return ret;
5259 }
5260