xref: /linux/drivers/iommu/intel/iommu.c (revision d0fde6aae2bacdc024fff43461ba0f325375fa97)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
58 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60 
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN		(1)
63 
64 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
65 
66 /* page table handling */
67 #define LEVEL_STRIDE		(9)
68 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69 
static inline int agaw_to_level(int agaw)
{
	/* AGAW 0 corresponds to 2-level paging; each AGAW step adds a level. */
	return 2 + agaw;
}
74 
75 static inline int agaw_to_width(int agaw)
76 {
77 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79 
80 static inline int width_to_agaw(int width)
81 {
82 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84 
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 	return (level - 1) * LEVEL_STRIDE;
88 }
89 
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94 
95 static inline u64 level_mask(int level)
96 {
97 	return -1ULL << level_to_offset_bits(level);
98 }
99 
100 static inline u64 level_size(int level)
101 {
102 	return 1ULL << level_to_offset_bits(level);
103 }
104 
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 	return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109 
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114 
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
/* First VT-d PFN covered by MM page frame @mm_pfn. */
static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* Last VT-d PFN covered by MM page frame @mm_pfn. */
static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
{
	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
}
/* First VT-d PFN backing struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn_start(page_to_pfn(pg));
}
/* First VT-d PFN backing the (direct-mapped) kernel address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
133 
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136 
137 /*
138  * set to 1 to panic kernel if can't successfully enable VT-d
139  * (used when kernel is launched w/ TXT)
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	/* Bit 0 of the low qword is the present bit. */
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	/* Bit 0 of the high qword is the present bit. */
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
170 
static inline void context_set_present(struct context_entry *context)
{
	/* Bit 0 of the low qword: present. */
	context->lo |= 1;
}
175 
static inline void context_set_fault_enable(struct context_entry *context)
{
	/* Clear bit 1 (fault processing disable); preserve all other bits. */
	context->lo &= (((u64)-1) << 2) | 1;
}
180 
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	/* Translation type occupies bits 3:2; clear the field, then set it. */
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}
187 
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	/* Replace the page-aligned address-root pointer, keep low flag bits. */
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}
194 
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	/* AGAW encoding lives in bits 2:0 of the high qword. */
	context->hi |= value & 7;
}
200 
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	/* 16-bit domain id in bits 23:8 of the high qword. */
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}
206 
static inline void context_set_pasid(struct context_entry *context)
{
	/* Allow requests-with-PASID through this context entry. */
	context->lo |= CONTEXT_PASIDE;
}
211 
static inline int context_domain_id(struct context_entry *c)
{
	/* Extract the 16-bit domain id from bits 23:8 of the high qword. */
	return((c->hi >> 8) & 0xffff);
}
216 
static inline void context_clear_entry(struct context_entry *context)
{
	/* Wipe both qwords; this also clears the present bit. */
	context->lo = 0;
	context->hi = 0;
}
222 
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 	if (!iommu->copied_tables)
226 		return false;
227 
228 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230 
/* Mark the context entry for @bus/@devfn as copied from the old kernel. */
static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
236 
/* Drop the "copied from old kernel" mark for @bus/@devfn. */
static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
242 
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251 
252 struct dmar_rmrr_unit {
253 	struct list_head list;		/* list of rmrr units	*/
254 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
255 	u64	base_address;		/* reserved base address*/
256 	u64	end_address;		/* reserved end address */
257 	struct dmar_dev_scope *devices;	/* target devices */
258 	int	devices_cnt;		/* target device count */
259 };
260 
261 struct dmar_atsr_unit {
262 	struct list_head list;		/* list of ATSR units */
263 	struct acpi_dmar_header *hdr;	/* ACPI header */
264 	struct dmar_dev_scope *devices;	/* target devices */
265 	int devices_cnt;		/* target device count */
266 	u8 include_all:1;		/* include all ports */
267 };
268 
269 struct dmar_satc_unit {
270 	struct list_head list;		/* list of SATC units */
271 	struct acpi_dmar_header *hdr;	/* ACPI header */
272 	struct dmar_dev_scope *devices;	/* target devices */
273 	struct intel_iommu *iommu;	/* the corresponding iommu */
274 	int devices_cnt;		/* target device count */
275 	u8 atc_required:1;		/* ATS is required */
276 };
277 
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281 
282 #define for_each_rmrr_units(rmrr) \
283 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284 
285 static void intel_iommu_domain_free(struct iommu_domain *domain);
286 
287 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
288 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
289 
290 int intel_iommu_enabled = 0;
291 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
292 
293 static int dmar_map_gfx = 1;
294 static int intel_iommu_superpage = 1;
295 static int iommu_identity_mapping;
296 static int iommu_skip_te_disable;
297 
298 #define IDENTMAP_GFX		2
299 #define IDENTMAP_AZALIA		4
300 
301 const struct iommu_ops intel_iommu_ops;
302 const struct iommu_dirty_ops intel_dirty_ops;
303 
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308 
/* Forget that firmware had translation enabled before the handoff. */
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
313 
/* Record whether the IOMMU was handed off with translation enabled (TES set). */
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
322 
/*
 * Parse the "intel_iommu=" kernel command line option.  Options are
 * comma-separated and handled in order; unknown tokens are reported but
 * otherwise ignored.  Returns 1 (option consumed) or -EINVAL for a NULL
 * string.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			/* Explicit "off": don't let platform opt-in override it. */
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is disallowed\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else {
			pr_notice("Unknown option - '%s'\n", str);
		}

		/* Advance past this token to the next comma-separated one. */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}

	return 1;
}
368 __setup("intel_iommu=", intel_iommu_setup);
369 
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 	struct page *page;
373 	void *vaddr = NULL;
374 
375 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 	if (page)
377 		vaddr = page_address(page);
378 	return vaddr;
379 }
380 
/* Release a page-table page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
385 
/* True if @domain is the static identity (pass-through) domain. */
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}
390 
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 				       unsigned long pfn)
393 {
394 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395 
396 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398 
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
	unsigned long fl_sagaw, sl_sagaw;

	/* First level: 4-level always (bit 2); 5-level if the cap says so. */
	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
	sl_sagaw = cap_sagaw(iommu->cap);

	/* Second level only. */
	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
		return sl_sagaw;

	/* First level only. */
	if (!ecap_slts(iommu->ecap))
		return fl_sagaw;

	/* Both supported: intersect so either table format can be used. */
	return fl_sagaw & sl_sagaw;
}
421 
/*
 * Pick the widest AGAW supported by @iommu that does not exceed
 * @max_gaw.  Returns -1 when no supported AGAW fits.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw;

	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--)
		if (test_bit(agaw, &sagaw))
			return agaw;

	return -1;
}
435 
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	/* Widest AGAW the hardware supports, not limited by the default. */
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
443 
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	/* Default-width AGAW, reduced when this IOMMU can't support it. */
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
453 
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 	return sm_supported(iommu) ?
457 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459 
/*
 * Recompute @domain->iommu_coherency: true only if every IOMMU the
 * domain is attached to has coherent page walks.  If the domain has no
 * IOMMUs attached yet, fall back to the AND over all active IOMMUs.
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct iommu_domain_info *info;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	unsigned long i;

	domain->iommu_coherency = true;
	xa_for_each(&domain->iommu_array, i, info) {
		found = true;
		if (!iommu_paging_structure_coherency(info->iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	rcu_read_unlock();
}
489 
/*
 * Compute the largest super-page level usable across all active IOMMUs
 * (optionally excluding @skip).  Returns fls() of the common 2-bit mask:
 * 0 = no super pages, 1 = 2MiB, 2 = 1GiB.
 */
static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage)
		return 0;

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain->use_first_level) {
				/* First level: only 2MiB unless 1GiB is capable. */
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}
519 
/*
 * Pick a NUMA node for @domain: the node of the first attached device
 * that reports one, or NUMA_NO_NODE when none does.
 */
static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't perfect answer in such situation, so we select first
		 * come first served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}
	spin_unlock_irqrestore(&domain->lock, flags);

	return nid;
}
542 
543 static void domain_update_iotlb(struct dmar_domain *domain);
544 
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 	unsigned long bitmap = 0;
549 
550 	/*
551 	 * 1-level super page supports page size of 2MiB, 2-level super page
552 	 * supports page size of both 2MiB and 1GiB.
553 	 */
554 	if (domain->iommu_superpage == 1)
555 		bitmap |= SZ_2M;
556 	else if (domain->iommu_superpage == 2)
557 		bitmap |= SZ_2M | SZ_1G;
558 
559 	return bitmap;
560 }
561 
562 /* Some capabilities may be different across iommus */
/* Recompute @domain's cached per-IOMMU capabilities after (de)attach. */
void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain->use_first_level)
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	/* Refresh super-page sizes and IOTLB-related state. */
	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
	domain_update_iotlb(domain);
}
590 
/*
 * Return the context entry for @bus/@devfn, allocating the context-table
 * page when @alloc is set.  In scalable mode the root entry is split:
 * the low qword covers devfn 0-127 and the high qword devfn 128-255,
 * and each entry occupies two legacy-sized slots (hence devfn *= 2).
 * Returns NULL when the table is absent and @alloc is clear, or on
 * allocation failure.
 */
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	/*
	 * Except that the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
	if (!alloc && context_copied(iommu, bus, devfn))
		return NULL;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	/* Bit 0 of the root qword is the present bit. */
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
		if (!context)
			return NULL;

		/* Make the new table visible to the hardware walker. */
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
631 
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *				 sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 	struct pci_dev *pdev, *pbridge;
644 
645 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 		return false;
647 
648 	pdev = to_pci_dev(dev);
649 	pbridge = to_pci_dev(bridge);
650 
651 	if (pbridge->subordinate &&
652 	    pbridge->subordinate->number <= pdev->bus->number &&
653 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 		return true;
655 
656 	return false;
657 }
658 
/*
 * Detect BIOSes that wrongly place the IOAT SNB device under another
 * IOMMU in the DMAR scope tables.  Returns true when the reported DRHD
 * does not match the device's real, on-chipset IOMMU location.
 */
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}
688 
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 	if (!iommu || iommu->drhd->ignored)
692 		return true;
693 
694 	if (dev_is_pci(dev)) {
695 		struct pci_dev *pdev = to_pci_dev(dev);
696 
697 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 		    quirk_ioat_snb_local_iommu(pdev))
700 			return true;
701 	}
702 
703 	return false;
704 }
705 
/*
 * Look up the IOMMU that translates @dev via the DMAR scope tables and,
 * when @bus/@devfn are non-NULL, report the source-id to use with it.
 * PCI VFs are looked up through their PF but keep their own BDF; ACPI
 * companions are resolved to their ACPI device.  Returns NULL when no
 * (real) IOMMU covers the device.
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	/* Quirked/ignored units count as "no IOMMU". */
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
775 
/* Flush CPU cache lines for page-table memory when walks are non-coherent. */
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
782 
/*
 * Free every context table referenced from @iommu's root table, then
 * the root table itself.  In scalable mode each bus additionally has an
 * upper context table (devfn >= 0x80).
 */
static void free_context_table(struct intel_iommu *iommu)
{
	struct context_entry *context;
	int i;

	if (!iommu->root_entry)
		return;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}

	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
}
807 
808 #ifdef CONFIG_DMAR_DEBUG
/*
 * Print every PTE visited while walking to @pfn, starting from
 * @parent at @level.  @bus/@devfn are currently unused in the body;
 * presumably kept for the caller's context — confirm before removing.
 */
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
{
	struct dma_pte *pte;
	int offset;

	while (1) {
		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Stop on holes or superpages — nothing further to descend. */
		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
			pr_info("PTE not present at level %d\n", level);
			break;
		}

		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);

		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}
}
832 
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 			  unsigned long long addr, u32 pasid)
835 {
836 	struct pasid_dir_entry *dir, *pde;
837 	struct pasid_entry *entries, *pte;
838 	struct context_entry *ctx_entry;
839 	struct root_entry *rt_entry;
840 	int i, dir_index, index, level;
841 	u8 devfn = source_id & 0xff;
842 	u8 bus = source_id >> 8;
843 	struct dma_pte *pgtable;
844 
845 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846 
847 	/* root entry dump */
848 	rt_entry = &iommu->root_entry[bus];
849 	if (!rt_entry) {
850 		pr_info("root table entry is not present\n");
851 		return;
852 	}
853 
854 	if (sm_supported(iommu))
855 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 			rt_entry->hi, rt_entry->lo);
857 	else
858 		pr_info("root entry: 0x%016llx", rt_entry->lo);
859 
860 	/* context entry dump */
861 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862 	if (!ctx_entry) {
863 		pr_info("context table entry is not present\n");
864 		return;
865 	}
866 
867 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 		ctx_entry->hi, ctx_entry->lo);
869 
870 	/* legacy mode does not require PASID entries */
871 	if (!sm_supported(iommu)) {
872 		level = agaw_to_level(ctx_entry->hi & 7);
873 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 		goto pgtable_walk;
875 	}
876 
877 	/* get the pointer to pasid directory entry */
878 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 	if (!dir) {
880 		pr_info("pasid directory entry is not present\n");
881 		return;
882 	}
883 	/* For request-without-pasid, get the pasid from context entry */
884 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885 		pasid = IOMMU_NO_PASID;
886 
887 	dir_index = pasid >> PASID_PDE_SHIFT;
888 	pde = &dir[dir_index];
889 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890 
891 	/* get the pointer to the pasid table entry */
892 	entries = get_pasid_table_from_pde(pde);
893 	if (!entries) {
894 		pr_info("pasid table entry is not present\n");
895 		return;
896 	}
897 	index = pasid & PASID_PTE_MASK;
898 	pte = &entries[index];
899 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901 
902 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905 	} else {
906 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908 	}
909 
910 pgtable_walk:
911 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914 
/*
 * Walk (and, when needed, build) the page table down to @pfn.
 * On entry *@target_level is the level to stop at (0 = walk to the
 * deepest existing mapping); when 0 was passed, *@target_level is set
 * on exit to the level actually reached.  Returns NULL when @pfn is
 * outside the domain's address width or a table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		/* Missing intermediate table: allocate and install it. */
		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
970 
971 /* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		/* Hole: report the level where the walk stopped. */
		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		/* Superpage above the requested level: return it instead. */
		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
1002 
1003 /* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Hole at this level: skip to its next boundary. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear consecutive leaf PTEs within the same table page. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
1034 
/*
 * Recursively free page-table pages fully covered by
 * [@start_pfn, @last_pfn] below @retain_level.  Leaf PTEs in the range
 * are expected to have been cleared already.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1073 
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079 				   unsigned long start_pfn,
1080 				   unsigned long last_pfn,
1081 				   int retain_level)
1082 {
1083 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 
1085 	/* We don't need lock here; nobody else touches the iova range */
1086 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 			   domain->pgd, 0, start_pfn, last_pfn);
1088 
1089 	/* free pgd */
1090 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 		free_pgtable_page(domain->pgd);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	/* Queue the page this PTE points to; actual freeing happens later. */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	/* Level 1 tables contain only leaf entries - nothing to recurse. */
	if (level == 1)
		return;

	/* Recurse into every present, non-superpage child table. */
	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}
1121 
/*
 * Clear PTEs covering [@start_pfn, @last_pfn] in the table at @level and
 * collect fully-covered lower-level table pages on @freelist for deferred
 * freeing (after the IOTLB flush). Modified PTEs are cache-flushed in one
 * contiguous run at the end.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush the run of cleared PTEs in one go, if any were modified. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1165 
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	/* Reject ranges outside the domain or with inverted bounds. */
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd, deferred via freelist like the rest of the tables */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}
1187 
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191 	struct root_entry *root;
1192 
1193 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194 	if (!root) {
1195 		pr_err("Allocating root entry for %s failed\n",
1196 			iommu->name);
1197 		return -ENOMEM;
1198 	}
1199 
1200 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1201 	iommu->root_entry = root;
1202 
1203 	return 0;
1204 }
1205 
/*
 * Program the root-table address register and issue the Set Root Table
 * Pointer (SRTP) command, then invalidate all translation caches unless
 * the hardware does that itself as part of SRTP (ESRTPS capability).
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	/* Scalable mode uses the same register with the SMT flag set. */
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1239 
/*
 * Flush the IOMMU's internal write buffer. Only needed when the
 * hardware requires it (RWBF capability) or a quirk forces it.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1257 
/*
 * Issue a register-based context-cache invalidation of the given @type
 * (global, domain-selective, or device-selective) and wait for the
 * hardware to complete it.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1293 
/*
 * Issue a register-based IOTLB invalidation of the given @type (global,
 * domain-selective, or page-selective) and wait for completion. For PSI,
 * @addr carries the IH bit and @size_order encodes the range.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	/* Hardware may have widened the flush; that is only worth a debug note. */
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1344 
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349 	struct device_domain_info *info;
1350 	unsigned long flags;
1351 
1352 	spin_lock_irqsave(&domain->lock, flags);
1353 	list_for_each_entry(info, &domain->devices, link) {
1354 		if (info->iommu == iommu && info->bus == bus &&
1355 		    info->devfn == devfn) {
1356 			spin_unlock_irqrestore(&domain->lock, flags);
1357 			return info;
1358 		}
1359 	}
1360 	spin_unlock_irqrestore(&domain->lock, flags);
1361 
1362 	return NULL;
1363 }
1364 
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 	struct dev_pasid_info *dev_pasid;
1368 	struct device_domain_info *info;
1369 	bool has_iotlb_device = false;
1370 	unsigned long flags;
1371 
1372 	spin_lock_irqsave(&domain->lock, flags);
1373 	list_for_each_entry(info, &domain->devices, link) {
1374 		if (info->ats_enabled) {
1375 			has_iotlb_device = true;
1376 			break;
1377 		}
1378 	}
1379 
1380 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 		info = dev_iommu_priv_get(dev_pasid->dev);
1382 		if (info->ats_enabled) {
1383 			has_iotlb_device = true;
1384 			break;
1385 		}
1386 	}
1387 	domain->has_iotlb_device = has_iotlb_device;
1388 	spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390 
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401 		return false;
1402 
1403 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404 		return false;
1405 
1406 	return true;
1407 }
1408 
/*
 * Enable the PCIe capabilities (PASID, ATS) relevant to the IOMMU on
 * the device described by @info. No-op for non-PCI devices.
 */
static void iommu_enable_pci_caps(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	/* ATS requires page-aligned translation requests from the device. */
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
	}
}
1432 
/*
 * Disable the PCIe capabilities enabled by iommu_enable_pci_caps(),
 * in reverse order: ATS first, then PASID.
 */
static void iommu_disable_pci_caps(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		/* The domain may no longer have any ATS-capable device. */
		domain_update_iotlb(info->domain);
	}

	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
}
1453 
/*
 * Invalidate the device IOTLB (ATS) of one device for the range
 * [@addr, @addr + 2^@mask pages), applying the QAT quirk as needed.
 * No-op unless the device has ATS enabled.
 */
static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
}
1468 
/*
 * Invalidate the device IOTLBs of every ATS-enabled device attached to
 * @domain, including devices attached through a PASID.
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	struct dev_pasid_info *dev_pasid;
	struct device_domain_info *info;
	unsigned long flags;

	/* Fast path: no attached device has ATS enabled. */
	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
		info = dev_iommu_priv_get(dev_pasid->dev);

		if (!info->ats_enabled)
			continue;

		/* PASID-tagged devTLB entries need a PASID-qualified flush. */
		qi_flush_dev_iotlb_pasid(info->iommu,
					 PCI_DEVID(info->bus, info->devfn),
					 info->pfsid, dev_pasid->pasid,
					 info->ats_qdep, addr,
					 mask);
	}
	spin_unlock_irqrestore(&domain->lock, flags);
}
1497 
/*
 * Flush first-level (PASID-tagged) IOTLB entries of @domain on @iommu
 * for [@addr, @addr + @npages), for every attached PASID and - when
 * devices are attached without a PASID - for IOMMU_NO_PASID as well.
 * @ih: invalidation-hint bit to pass through to the flush.
 */
static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
				     struct dmar_domain *domain, u64 addr,
				     unsigned long npages, bool ih)
{
	u16 did = domain_id_iommu(domain, iommu);
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);
	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
	spin_unlock_irqrestore(&domain->lock, flags);
}
1514 
/*
 * Page-selective IOTLB invalidation of [@pfn, @pfn + @pages) for @domain
 * on @iommu, falling back to a domain-selective flush when PSI is not
 * supported or the range is too large. Also flushes device IOTLBs unless
 * this is a map operation in caching mode.
 * @ih:  invalidation hint (leaf entries unchanged)
 * @map: non-zero for a non-present-to-present (map) transition
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned int mask = ilog2(aligned_pages);
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain_id_iommu(domain, iommu);

	if (WARN_ON(!pages))
		return;

	/* Move the hint into bit 6 where the IOTLB register expects it. */
	if (ih)
		ih = 1 << 6;

	if (domain->use_first_level) {
		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
	} else {
		unsigned long bitmask = aligned_pages - 1;

		/*
		 * PSI masks the low order bits of the base address. If the
		 * address isn't aligned to the mask, then compute a mask value
		 * needed to ensure the target range is flushed.
		 */
		if (unlikely(bitmask & pfn)) {
			unsigned long end_pfn = pfn + pages - 1, shared_bits;

			/*
			 * Since end_pfn <= pfn + bitmask, the only way bits
			 * higher than bitmask can differ in pfn and end_pfn is
			 * by carrying. This means after masking out bitmask,
			 * high bits starting with the first set bit in
			 * shared_bits are all equal in both pfn and end_pfn.
			 */
			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
		}

		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
							DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
							DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
1575 
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 					struct dmar_domain *domain,
1579 					unsigned long pfn, unsigned int pages)
1580 {
1581 	/*
1582 	 * It's a non-present to present mapping. Only flush if caching mode
1583 	 * and second level.
1584 	 */
1585 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 	else
1588 		iommu_flush_write_buffer(iommu);
1589 }
1590 
/*
 * iommu_domain op: flush all IOTLB state for @domain on every IOMMU it
 * is attached to, plus device IOTLBs when the hardware caches them.
 */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct iommu_domain_info *info;
	unsigned long idx;

	xa_for_each(&dmar_domain->iommu_array, idx, info) {
		struct intel_iommu *iommu = info->iommu;
		u16 did = domain_id_iommu(dmar_domain, iommu);

		if (dmar_domain->use_first_level)
			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		/* In caching mode there are no ATS entries to invalidate. */
		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
	}
}
1611 
/*
 * Disable the protected low/high memory regions (PLMR/PHMR) so DMA to
 * those ranges is no longer blocked. No-op if neither is supported.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1631 
/*
 * Set the Translation Enable bit in the global command register and
 * wait for the hardware to acknowledge it.
 */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1647 
/*
 * Clear the Translation Enable bit and wait for acknowledgement.
 * Skipped on units dedicated to graphics when the quirk demands it.
 */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1667 
/*
 * Allocate the domain-ID bitmap for @iommu and reserve the IDs that
 * must never be handed out. Returns 0 or -ENOMEM.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
1702 
/*
 * Turn off translation on @iommu once no real domain IDs remain in use
 * (only the reserved IDs may still be set in the bitmap).
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	if (!iommu->domain_ids)
		return;

	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
		    > NUM_RESERVED_DID))
		return;

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
1719 
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722 	if (iommu->domain_ids) {
1723 		bitmap_free(iommu->domain_ids);
1724 		iommu->domain_ids = NULL;
1725 	}
1726 
1727 	if (iommu->copied_tables) {
1728 		bitmap_free(iommu->copied_tables);
1729 		iommu->copied_tables = NULL;
1730 	}
1731 
1732 	/* free context mapping */
1733 	free_context_table(iommu);
1734 
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736 	if (pasid_supported(iommu)) {
1737 		if (ecap_prs(iommu->ecap))
1738 			intel_svm_finish_prq(iommu);
1739 	}
1740 #endif
1741 }
1742 
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749 	/* Only SL is available in legacy mode */
1750 	if (!scalable_mode_support())
1751 		return false;
1752 
1753 	/* Only level (either FL or SL) is available, just use it */
1754 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 		return intel_cap_flts_sanity();
1756 
1757 	/* Both levels are available, decide it based on domain type */
1758 	return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760 
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763 	struct dmar_domain *domain;
1764 
1765 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766 	if (!domain)
1767 		return NULL;
1768 
1769 	domain->nid = NUMA_NO_NODE;
1770 	if (first_level_by_default(type))
1771 		domain->use_first_level = true;
1772 	domain->has_iotlb_device = false;
1773 	INIT_LIST_HEAD(&domain->devices);
1774 	INIT_LIST_HEAD(&domain->dev_pasids);
1775 	spin_lock_init(&domain->lock);
1776 	xa_init(&domain->iommu_array);
1777 
1778 	return domain;
1779 }
1780 
/*
 * Attach @domain to @iommu: take a reference if already attached,
 * otherwise allocate a domain ID on this IOMMU and record the binding
 * in domain->iommu_array. Returns 0 on success or a negative errno.
 */
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/* Allocate up front; we cannot sleep under iommu->lock. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		/* Already attached to this IOMMU - just bump the refcount. */
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt	= 1;
	info->did	= num;
	info->iommu	= iommu;
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		/* Lost a race or xarray error - roll back the domain ID. */
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}
1829 
/*
 * Drop @domain's reference on @iommu; on the last reference release the
 * domain ID and remove the binding installed by domain_attach_iommu().
 */
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	spin_lock(&iommu->lock);
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}
1845 
/*
 * Round a guest address width up to the nearest adjusted guest address
 * width the page-table hardware can express: 12 bits of page offset
 * plus a whole number of 9-bit level strides, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1859 
/*
 * Tear down a domain: unmap and free its entire page table, then free
 * the domain itself. Devices must already have been detached.
 */
static void domain_exit(struct dmar_domain *domain)
{
	if (domain->pgd) {
		LIST_HEAD(freelist);

		/* Full-range unmap collects every table page on freelist. */
		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
		put_pages_list(&freelist);
	}

	/* Leaking the domain beats a use-after-free by attached devices. */
	if (WARN_ON(!list_empty(&domain->devices)))
		return;

	kfree(domain);
}
1874 
1875 /*
1876  * Get the PASID directory size for scalable mode context entry.
1877  * Value of X in the PDTS field of a scalable mode context entry
1878  * indicates PASID directory with 2^(X + 7) entries.
1879  */
1880 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1881 {
1882 	unsigned long pds, max_pde;
1883 
1884 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1885 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1886 	if (pds < 7)
1887 		return 0;
1888 
1889 	return pds - 7;
1890 }
1891 
1892 /*
1893  * Set the RID_PASID field of a scalable mode context entry. The
1894  * IOMMU hardware will use the PASID value set in this field for
1895  * DMA translations of DMA requests without PASID.
1896  */
1897 static inline void
1898 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1899 {
1900 	context->hi |= pasid & ((1 << 20) - 1);
1901 }
1902 
1903 /*
1904  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1905  * entry.
1906  */
1907 static inline void context_set_sm_dte(struct context_entry *context)
1908 {
1909 	context->lo |= BIT_ULL(2);
1910 }
1911 
1912 /*
1913  * Set the PRE(Page Request Enable) field of a scalable mode context
1914  * entry.
1915  */
1916 static inline void context_set_sm_pre(struct context_entry *context)
1917 {
1918 	context->lo |= BIT_ULL(4);
1919 }
1920 
1921 /* Convert value to context PASID directory size field coding. */
1922 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1923 
/*
 * Program the context entry for (@bus, @devfn) on @iommu so the device
 * translates through @domain. In scalable mode the entry points at the
 * PASID table @table; in legacy mode it points at the domain's second-
 * level page table (or is set to pass-through for the identity domain).
 * Returns 0 on success or a negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct context_entry *context;
	int ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* Already mapped and not inherited from a previous kernel - done. */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
		if (info && info->pasid_supported)
			context_set_pasid(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If the
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
2064 
/* Context shared by domain_context_mapping_cb() across DMA aliases. */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};
2070 
2071 static int domain_context_mapping_cb(struct pci_dev *pdev,
2072 				     u16 alias, void *opaque)
2073 {
2074 	struct domain_context_mapping_data *data = opaque;
2075 
2076 	return domain_context_mapping_one(data->domain, data->iommu,
2077 					  data->table, PCI_BUS_NUM(alias),
2078 					  alias & 0xff);
2079 }
2080 
2081 static int
2082 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2083 {
2084 	struct domain_context_mapping_data data;
2085 	struct pasid_table *table;
2086 	struct intel_iommu *iommu;
2087 	u8 bus, devfn;
2088 
2089 	iommu = device_to_iommu(dev, &bus, &devfn);
2090 	if (!iommu)
2091 		return -ENODEV;
2092 
2093 	table = intel_pasid_get_table(dev);
2094 
2095 	if (!dev_is_pci(dev))
2096 		return domain_context_mapping_one(domain, iommu, table,
2097 						  bus, devfn);
2098 
2099 	data.domain = domain;
2100 	data.iommu = iommu;
2101 	data.table = table;
2102 
2103 	return pci_for_each_dma_alias(to_pci_dev(dev),
2104 				      &domain_context_mapping_cb, &data);
2105 }
2106 
2107 /* Returns a number of VTD pages, but aligned to MM page size */
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109 					    size_t size)
2110 {
2111 	host_addr &= ~PAGE_MASK;
2112 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114 
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117 					  unsigned long iov_pfn,
2118 					  unsigned long phy_pfn,
2119 					  unsigned long pages)
2120 {
2121 	int support, level = 1;
2122 	unsigned long pfnmerge;
2123 
2124 	support = domain->iommu_superpage;
2125 
2126 	/* To use a large page, the virtual *and* physical addresses
2127 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128 	   of them will mean we have to use smaller pages. So just
2129 	   merge them and check both at once. */
2130 	pfnmerge = iov_pfn | phy_pfn;
2131 
2132 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133 		pages >>= VTD_STRIDE_SHIFT;
2134 		if (!pages)
2135 			break;
2136 		pfnmerge >>= VTD_STRIDE_SHIFT;
2137 		level++;
2138 		support--;
2139 	}
2140 	return level;
2141 }
2142 
2143 /*
2144  * Ensure that old small page tables are removed to make room for superpage(s).
2145  * We're going to add new large pages, so make sure we don't remove their parent
2146  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2147  */
2148 static void switch_to_super_page(struct dmar_domain *domain,
2149 				 unsigned long start_pfn,
2150 				 unsigned long end_pfn, int level)
2151 {
2152 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2153 	struct iommu_domain_info *info;
2154 	struct dma_pte *pte = NULL;
2155 	unsigned long i;
2156 
2157 	while (start_pfn <= end_pfn) {
2158 		if (!pte)
2159 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2160 					     GFP_ATOMIC);
2161 
2162 		if (dma_pte_present(pte)) {
2163 			dma_pte_free_pagetable(domain, start_pfn,
2164 					       start_pfn + lvl_pages - 1,
2165 					       level + 1);
2166 
2167 			xa_for_each(&domain->iommu_array, i, info)
2168 				iommu_flush_iotlb_psi(info->iommu, domain,
2169 						      start_pfn, lvl_pages,
2170 						      0, 0);
2171 		}
2172 
2173 		pte++;
2174 		start_pfn += lvl_pages;
2175 		if (first_pte_in_page(pte))
2176 			pte = NULL;
2177 	}
2178 }
2179 
/*
 * Map nr_pages VT-d pages at @iov_pfn to @phys_pfn in @domain with
 * protection @prot, using superpages where alignment and hardware
 * capability allow.  Returns 0 on success or a negative errno.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	/* The whole requested range must fit within the domain's AGAW. */
	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Translate the generic prot bits into PTE attribute bits. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!pte) {
			/* Choose the biggest superpage level usable here. */
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				/* Remove any stale small-page tables below. */
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Entry was already set: report (rate-limited dumps). */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2285 
/*
 * Clear the context entry for @bus/@devfn on @info's IOMMU and flush the
 * caches that could still hold the old translation: context cache,
 * PASID cache (scalable mode), IOTLB, and the device's own device-TLB.
 */
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
	struct intel_iommu *iommu = info->iommu;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock(&iommu->lock);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock(&iommu->lock);
		return;
	}

	/* Capture the domain ID to invalidate before clearing the entry. */
	if (sm_supported(iommu)) {
		if (hw_pass_through && domain_type_is_si(info->domain))
			did_old = FLPT_DEFAULT_DID;
		else
			did_old = domain_id_iommu(info->domain, iommu);
	} else {
		did_old = context_domain_id(context);
	}

	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock(&iommu->lock);
	/* Invalidate the context-cache entry for this device. */
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);

	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);

	/* Domain-selective IOTLB flush for the old domain ID. */
	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);

	/* Also drop anything the endpoint cached in its device-TLB. */
	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
}
2331 
/*
 * Program a first-level (scalable-mode) PASID entry so that @pasid of
 * @dev translates through @domain's page table on @iommu.
 *
 * Returns 0 on success or a negative errno.
 */
static int domain_setup_first_level(struct intel_iommu *iommu,
				    struct dmar_domain *domain,
				    struct device *dev,
				    u32 pasid)
{
	struct dma_pte *pgd = domain->pgd;
	int agaw, level;
	int flags = 0;

	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		pgd = phys_to_virt(dma_pte_addr(pgd));
		if (!dma_pte_present(pgd))
			return -ENOMEM;
	}

	/* First-level translation supports only 4- and 5-level tables. */
	level = agaw_to_level(agaw);
	if (level != 4 && level != 5)
		return -EINVAL;

	if (level == 5)
		flags |= PASID_FLAG_FL5LP;

	if (domain->force_snooping)
		flags |= PASID_FLAG_PAGE_SNOOP;

	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
					     domain_id_iommu(domain, iommu),
					     flags);
}
2365 
2366 static bool dev_is_real_dma_subdevice(struct device *dev)
2367 {
2368 	return dev && dev_is_pci(dev) &&
2369 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2370 }
2371 
2372 static int iommu_domain_identity_map(struct dmar_domain *domain,
2373 				     unsigned long first_vpfn,
2374 				     unsigned long last_vpfn)
2375 {
2376 	/*
2377 	 * RMRR range might have overlap with physical memory range,
2378 	 * clear it first
2379 	 */
2380 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2381 
2382 	return __domain_mapping(domain, first_vpfn,
2383 				first_vpfn, last_vpfn - first_vpfn + 1,
2384 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2385 }
2386 
2387 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2388 
/*
 * Allocate and populate the static identity (si) domain.  When @hw is
 * set (hardware pass-through) no page-table entries are needed;
 * otherwise identity-map every online node's memory plus all RMRRs.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *dev;
	int i, nid, ret;

	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		si_domain = NULL;
		return -EFAULT;
	}

	if (hw)
		return 0;

	/* 1:1 map all usable physical memory of every online node. */
	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start_pfn),
					mm_to_dma_pfn_end(end_pfn));
			if (ret)
				return ret;
		}
	}

	/*
	 * Identity map the RMRRs so that devices with RMRRs could also use
	 * the si_domain.
	 */
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			unsigned long long start = rmrr->base_address;
			unsigned long long end = rmrr->end_address;

			/* Skip firmware ranges the domain cannot address. */
			if (WARN_ON(end < start ||
				    end >> agaw_to_width(si_domain->agaw)))
				continue;

			ret = iommu_domain_identity_map(si_domain,
					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2445 
/*
 * Attach @dev to @domain: join the domain's device list, set up the
 * RID2PASID entry (scalable mode), program the context entry and enable
 * the device's PCI capabilities (ATS etc.).  On failure the device is
 * moved to blocked translation.
 */
static int dmar_domain_attach_device(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		return ret;
	info->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&info->link, &domain->devices);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		/* Setup the PASID entry for requests without PASID: */
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, IOMMU_NO_PASID);
		else if (domain->use_first_level)
			ret = domain_setup_first_level(iommu, domain, dev,
					IOMMU_NO_PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, IOMMU_NO_PASID);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			device_block_translation(dev);
			return ret;
		}
	}

	ret = domain_context_mapping(domain, dev);
	if (ret) {
		dev_err(dev, "Domain context map failed\n");
		device_block_translation(dev);
		return ret;
	}

	iommu_enable_pci_caps(info);

	return 0;
}
2497 
2498 /**
2499  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500  * is relaxable (ie. is allowed to be not enforced under some conditions)
2501  * @dev: device handle
2502  *
2503  * We assume that PCI USB devices with RMRRs have them largely
2504  * for historical reasons and that the RMRR space is not actively used post
2505  * boot.  This exclusion may change if vendors begin to abuse it.
2506  *
2507  * The same exception is made for graphics devices, with the requirement that
2508  * any use of the RMRR regions will be torn down before assigning the device
2509  * to a guest.
2510  *
2511  * Return: true if the RMRR is relaxable, false otherwise
2512  */
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2514 {
2515 	struct pci_dev *pdev;
2516 
2517 	if (!dev_is_pci(dev))
2518 		return false;
2519 
2520 	pdev = to_pci_dev(dev);
2521 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522 		return true;
2523 	else
2524 		return false;
2525 }
2526 
2527 /*
2528  * Return the required default domain type for a specific device.
2529  *
2530  * @dev: the device in query
2531  * @startup: true if this is during early boot
2532  *
2533  * Returns:
2534  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2536  *  - 0: both identity and dynamic domains work for this device
2537  */
2538 static int device_def_domain_type(struct device *dev)
2539 {
2540 	if (dev_is_pci(dev)) {
2541 		struct pci_dev *pdev = to_pci_dev(dev);
2542 
2543 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544 			return IOMMU_DOMAIN_IDENTITY;
2545 
2546 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547 			return IOMMU_DOMAIN_IDENTITY;
2548 	}
2549 
2550 	return 0;
2551 }
2552 
/*
 * Choose and install the invalidation mechanism for @iommu: queued
 * invalidation if it can be enabled, register-based invalidation as the
 * fallback.  Resets stale hardware state left by firmware first.
 */
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}
2587 
/*
 * Copy one bus's context table(s) from the previous (crashed) kernel.
 * In extended mode each bus has two half-tables (devfn < 0x80 via LCTP,
 * devfn >= 0x80 via UCTP), hence the *2 indexing into @tbl.  Newly
 * allocated copies are stored in @tbl for later installation into the
 * root entries.  Returns 0 on success or a negative errno.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the old domain ID so new allocations avoid it. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2671 
/*
 * Copy the complete translation structures (context tables) left over
 * by the previous kernel (kdump case) and hook them into the freshly
 * allocated root-entry table, so in-flight DMA keeps working during the
 * handover.  Returns 0 on success or a negative errno.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	/* Best-effort: a failed bus is logged but doesn't abort the copy. */
	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2756 
/*
 * One-time DMA-remapping initialization for all DMAR units: audit
 * capabilities, set up invalidation and domain-ID tracking, allocate
 * root tables (or copy them from a previous kernel in kdump), build the
 * identity domain, and enable fault/page-request interrupts.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Firmware left translation enabled outside of kdump: reset. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Pass-through is only usable if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2913 
/*
 * Mark DRHD units that should be ignored: units whose device scope is
 * empty, and (when dmar_map_gfx is clear) units that serve graphics
 * devices only.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Bail out of the scan on the first non-graphics device. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (!dmar_map_gfx)
			drhd->ignored = 1;
	}
}
2949 
2950 #ifdef CONFIG_SUSPEND
/*
 * Re-initialize IOMMU hardware after resume: re-enable queued
 * invalidation where it was in use, then reprogram root entries and
 * re-enable translation on every non-ignored unit.
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int ret;

	for_each_active_iommu(iommu, drhd) {
		if (iommu->qi) {
			ret = dmar_reenable_qi(iommu);
			if (ret)
				return ret;
		}
	}

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
2984 
/*
 * Globally invalidate the context cache and IOTLB of every active
 * IOMMU (context cache first, then IOTLB).
 */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
2997 
/*
 * Syscore suspend: flush all caches, disable translation and save the
 * fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active
 * IOMMU so iommu_resume() can restore them.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;
}
3024 
/*
 * Syscore resume: re-initialize the hardware via init_iommu_hw() and
 * restore the fault-event registers saved by iommu_suspend().
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
}
3055 
/* System-core PM hooks: save/restore IOMMU state across suspend/resume. */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
3060 
/* Register the suspend/resume syscore hooks (CONFIG_SUSPEND builds only). */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
3065 
#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
3069 
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071 {
3072 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074 	    rmrr->end_address <= rmrr->base_address ||
3075 	    arch_rmrr_sanity_check(rmrr))
3076 		return -EINVAL;
3077 
3078 	return 0;
3079 }
3080 
/*
 * Parse one ACPI RMRR structure: warn (and taint) on malformed firmware
 * data, then record the unit and its device scope on dmar_rmrr_units.
 * Returns 0 on success or -ENOMEM.
 */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrr = (struct acpi_dmar_reserved_memory *)header;
	/* A broken RMRR is reported but still registered below. */
	if (rmrr_sanity_check(rmrr)) {
		pr_warn(FW_BUG
			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			   rmrr->base_address, rmrr->end_address,
			   dmi_get_system_info(DMI_BIOS_VENDOR),
			   dmi_get_system_info(DMI_BIOS_VERSION),
			   dmi_get_system_info(DMI_PRODUCT_VERSION));
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
	}

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;

	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	/* Device scope entries follow the fixed-size RMRR header. */
	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_rmrru;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}
3121 
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123 {
3124 	struct dmar_atsr_unit *atsru;
3125 	struct acpi_dmar_atsr *tmp;
3126 
3127 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128 				dmar_rcu_check()) {
3129 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130 		if (atsr->segment != tmp->segment)
3131 			continue;
3132 		if (atsr->header.length != tmp->header.length)
3133 			continue;
3134 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135 			return atsru;
3136 	}
3137 
3138 	return NULL;
3139 }
3140 
/*
 * Parse one ACPI ATSR structure and register it on dmar_atsr_units,
 * copying the ACPI data (it may live in a temporary _DSM buffer) and its
 * device scope.  Duplicates are ignored.  Returns 0 or -ENOMEM.
 */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}
3180 
/* Release an ATSR unit together with its device scope array. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
3186 
/*
 * Unregister and free the ATSR unit matching @hdr, if one exists.
 * Always returns 0 (a missing entry is not an error).
 */
int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		/* Let concurrent RCU list walkers finish before freeing. */
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}
3202 
/*
 * Check whether the ATSR described by @hdr may be released: the unit
 * must be unknown, be INCLUDE_ALL, or have no active device in its
 * scope. Returns -EBUSY if any scoped device is still present, else 0.
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		/* Any active device in scope blocks the release. */
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
3223 
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225 {
3226 	struct dmar_satc_unit *satcu;
3227 	struct acpi_dmar_satc *tmp;
3228 
3229 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230 				dmar_rcu_check()) {
3231 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232 		if (satc->segment != tmp->segment)
3233 			continue;
3234 		if (satc->header.length != tmp->header.length)
3235 			continue;
3236 		if (memcmp(satc, tmp, satc->header.length) == 0)
3237 			return satcu;
3238 	}
3239 
3240 	return NULL;
3241 }
3242 
/*
 * Parse one ACPI SATC (SoC Integrated Address Translation Cache)
 * structure and register it on dmar_satc_units, unless an identical
 * entry already exists. Returns 0 on success or duplicate, -ENOMEM
 * on allocation failure.
 */
int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_satc *satc;
	struct dmar_satc_unit *satcu;

	/* Ignore hot-added tables when the IOMMU driver is not active. */
	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	satc = container_of(hdr, struct acpi_dmar_satc, header);
	satcu = dmar_find_satc(satc);
	if (satcu)
		return 0;

	/* One allocation holds both the unit and a private table copy. */
	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
	if (!satcu)
		return -ENOMEM;

	/* Copy the table; the ACPI-provided buffer may be freed on return. */
	satcu->hdr = (void *)(satcu + 1);
	memcpy(satcu->hdr, hdr, hdr->length);
	satcu->atc_required = satc->flags & 0x1;
	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
					      (void *)satc + satc->header.length,
					      &satcu->devices_cnt);
	if (satcu->devices_cnt && !satcu->devices) {
		kfree(satcu);
		return -ENOMEM;
	}
	list_add_rcu(&satcu->list, &dmar_satc_units);

	return 0;
}
3274 
/*
 * Bring up a hot-added DMAR unit: audit its capabilities against the
 * running configuration, allocate domain/root-entry state, and enable
 * translation unless the unit is ignored. Returns 0 on success or a
 * negative errno; on failure all per-IOMMU state is torn down again.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/* The new unit must match features the system already relies on. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	/* Reject units lacking the superpage sizes already in system use. */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/* Point hardware at the root table, then switch translation on. */
	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
3346 
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348 {
3349 	int ret = 0;
3350 	struct intel_iommu *iommu = dmaru->iommu;
3351 
3352 	if (!intel_iommu_enabled)
3353 		return 0;
3354 	if (iommu == NULL)
3355 		return -EINVAL;
3356 
3357 	if (insert) {
3358 		ret = intel_iommu_add(dmaru);
3359 	} else {
3360 		disable_dmar_iommu(iommu);
3361 		free_dmar_iommu(iommu);
3362 	}
3363 
3364 	return ret;
3365 }
3366 
/*
 * Free all parsed RMRR, ATSR and SATC units. Called on init failure
 * or when the driver is disabled, so no RCU synchronization needed.
 */
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;
	struct dmar_satc_unit *satcu, *satc_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
		list_del(&satcu->list);
		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
		kfree(satcu);
	}
}
3389 
/*
 * Find the SATC unit whose device scope contains @dev (resolved to
 * its physical function first). Returns the unit or NULL.
 */
static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
{
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_satc *satc;
	struct device *tmp;
	int i;

	/* SATC scopes list physical functions, never VFs. */
	dev = pci_physfn(dev);
	rcu_read_lock();

	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (satc->segment != pci_domain_nr(dev->bus))
			continue;
		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
			if (to_pci_dev(tmp) == dev)
				goto out;
	}
	/* Fell off the list without a match. */
	satcu = NULL;
out:
	rcu_read_unlock();
	return satcu;
}
3413 
/*
 * Decide whether ATS may be enabled for @dev under @iommu: first via
 * the SATC table, otherwise by walking up to the root port and
 * matching it against the ATSR units. Returns 1 if ATS is allowed,
 * 0 otherwise.
 */
static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;

	dev = pci_physfn(dev);
	satcu = dmar_find_matched_satc_unit(dev);
	if (satcu)
		/*
		 * This device supports ATS as it is in SATC table.
		 * When IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for the device that requires
		 * ATS, hence OS should not enable this device ATS
		 * to avoid duplicated TLB invalidation.
		 */
		return !(satcu->atc_required && !sm_supported(iommu));

	/* Walk upstream to find the root port above this device. */
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		/* Match the root port itself against the ATSR scope. */
		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	/* No ATSR covers this root port: ATS not permitted. */
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
3469 
/*
 * PCI bus notifier hook: keep the cached RMRR/ATSR/SATC device scope
 * arrays in sync as devices are added to or removed from the bus.
 * Returns 0 on success or a negative errno from scope insertion.
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	/* Nothing to maintain once the driver is known to be disabled. */
	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* INCLUDE_ALL units carry no per-device scope to update. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			/* > 0 means the device matched; stop searching. */
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
3539 
/*
 * Memory hotplug notifier: keep the static identity domain's mappings
 * in sync with physical memory. New memory is identity-mapped before
 * it goes online; offlined (or cancelled) memory is unmapped and the
 * IOTLBs of all active IOMMUs are flushed.
 */
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	/* Convert the hotplugged pfn range to DMA (VT-d page) pfns. */
	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
			mhp->nr_pages - 1);

	switch (val) {
	case MEM_GOING_ONLINE:
		if (iommu_domain_identity_map(si_domain,
					      start_vpfn, last_vpfn)) {
			pr_warn("Failed to build identity map for [%lx-%lx]\n",
				start_vpfn, last_vpfn);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		{
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			LIST_HEAD(freelist);

			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					start_vpfn, mhp->nr_pages,
					list_empty(&freelist), 0);
			rcu_read_unlock();
			/* Page-table pages are safe to free after the flush. */
			put_pages_list(&freelist);
		}
		break;
	}

	return NOTIFY_OK;
}
3580 
/* Memory hotplug notifier, registered only when si_domain is in use. */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
3585 
/* Turn off DMA translation on every DMAR unit, active or not. */
static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}
3594 
/*
 * System shutdown/kexec hook: disable protected memory regions and
 * translation on all units so the next kernel starts from a clean
 * hardware state. No-op if the driver was never enabled.
 */
void intel_iommu_shutdown(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	if (no_iommu || dmar_disabled)
		return;

	down_write(&dmar_global_lock);

	/* Disable PMRs explicitly here. */
	for_each_iommu(iommu, drhd)
		iommu_disable_protect_mem_regions(iommu);

	/* Make sure the IOMMUs are switched off */
	intel_disable_iommus();

	up_write(&dmar_global_lock);
}
3614 
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616 {
3617 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618 
3619 	return container_of(iommu_dev, struct intel_iommu, iommu);
3620 }
3621 
3622 static ssize_t version_show(struct device *dev,
3623 			    struct device_attribute *attr, char *buf)
3624 {
3625 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627 	return sysfs_emit(buf, "%d:%d\n",
3628 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629 }
3630 static DEVICE_ATTR_RO(version);
3631 
3632 static ssize_t address_show(struct device *dev,
3633 			    struct device_attribute *attr, char *buf)
3634 {
3635 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637 }
3638 static DEVICE_ATTR_RO(address);
3639 
3640 static ssize_t cap_show(struct device *dev,
3641 			struct device_attribute *attr, char *buf)
3642 {
3643 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3645 }
3646 static DEVICE_ATTR_RO(cap);
3647 
3648 static ssize_t ecap_show(struct device *dev,
3649 			 struct device_attribute *attr, char *buf)
3650 {
3651 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653 }
3654 static DEVICE_ATTR_RO(ecap);
3655 
3656 static ssize_t domains_supported_show(struct device *dev,
3657 				      struct device_attribute *attr, char *buf)
3658 {
3659 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661 }
3662 static DEVICE_ATTR_RO(domains_supported);
3663 
3664 static ssize_t domains_used_show(struct device *dev,
3665 				 struct device_attribute *attr, char *buf)
3666 {
3667 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668 	return sysfs_emit(buf, "%d\n",
3669 			  bitmap_weight(iommu->domain_ids,
3670 					cap_ndoms(iommu->cap)));
3671 }
3672 static DEVICE_ATTR_RO(domains_used);
3673 
/* Attributes exposed under /sys/class/iommu/<name>/intel-iommu/. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3683 
/* Group the attributes under an "intel-iommu" sysfs subdirectory. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3688 
/* NULL-terminated group list passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3693 
3694 static inline bool has_external_pci(void)
3695 {
3696 	struct pci_dev *pdev = NULL;
3697 
3698 	for_each_pci_dev(pdev)
3699 		if (pdev->external_facing) {
3700 			pci_dev_put(pdev);
3701 			return true;
3702 		}
3703 
3704 	return false;
3705 }
3706 
/*
 * Force-enable the IOMMU when the platform opted in via the DMAR
 * table (DMA_CTRL_PLATFORM_OPT_IN_FLAG) and external-facing PCI
 * devices exist. Returns 1 if the IOMMU was forced on, 0 otherwise.
 */
static int __init platform_optin_force_iommu(void)
{
	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_info("Intel-IOMMU force enabled due to platform opt in\n");

	/*
	 * If Intel-IOMMU is disabled by default, we will apply identity
	 * map for all devices except those marked as being untrusted.
	 */
	if (dmar_disabled)
		iommu_set_default_passthrough(false);

	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}
3727 
/*
 * Probe the physical devices backing ACPI namespace devices listed in
 * the DRHD device scopes. Returns 0 on success or the first probe
 * error encountered.
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			/* Only ACPI namespace devices are of interest here. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* Lock keeps the physical node list stable. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			if (ret)
				return ret;
		}
	}

	return 0;
}
3762 
3763 static __init int tboot_force_iommu(void)
3764 {
3765 	if (!tboot_enabled())
3766 		return 0;
3767 
3768 	if (no_iommu || dmar_disabled)
3769 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3770 
3771 	dmar_disabled = 0;
3772 	no_iommu = 0;
3773 
3774 	return 1;
3775 }
3776 
/*
 * Main driver initialization: parse the DMAR table and device scopes,
 * set up DMA remapping on every unit, register sysfs/iommu-core
 * objects, and finally enable translation. Returns 0 on success or a
 * negative errno after freeing all parsed DMAR state.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	/* Allocate per-IOMMU data structures and program the hardware. */
	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* Identity domain must track memory hotplug, unless HW passthrough. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3911 
/*
 * pci_for_each_dma_alias() callback: clear the context entry for one
 * bus/devfn alias of the device described by @opaque.
 */
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct device_domain_info *info = opaque;

	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}
3919 
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices.  If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
/* Clear the context entries for every DMA alias of info->dev. */
static void domain_context_clear(struct device_domain_info *info)
{
	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(info->dev),
			       &domain_context_clear_one_cb, info);
}
3934 
/*
 * Detach @dev from its current domain: tear down PASID/context
 * entries, disable PCI caps, unlink the device from the domain's
 * device list and drop the domain's reference on the IOMMU.
 */
static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *domain = info->domain;
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	/* Subdevices share the parent's translation entries; skip them. */
	if (!dev_is_real_dma_subdevice(info->dev)) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					IOMMU_NO_PASID, false);

		iommu_disable_pci_caps(info);
		domain_context_clear(info);
	}

	spin_lock_irqsave(&domain->lock, flags);
	list_del(&info->link);
	spin_unlock_irqrestore(&domain->lock, flags);

	domain_detach_iommu(domain, iommu);
	info->domain = NULL;
}
3958 
/*
 * Clear the page table pointer in context or pasid table entries so that
 * all DMA requests without PASID from the device are blocked. If the page
 * table has been set, clean up the data structures.
 */
void device_block_translation(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	iommu_disable_pci_caps(info);
	if (!dev_is_real_dma_subdevice(dev)) {
		/* Scalable mode uses PASID entries, legacy mode context entries. */
		if (sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, dev,
						    IOMMU_NO_PASID, false);
		else
			domain_context_clear(info);
	}

	/* Nothing more to clean up if no domain was ever attached. */
	if (!info->domain)
		return;

	spin_lock_irqsave(&info->domain->lock, flags);
	list_del(&info->link);
	spin_unlock_irqrestore(&info->domain->lock, flags);

	domain_detach_iommu(info->domain, iommu);
	info->domain = NULL;
}
3989 
/*
 * Initialize a freshly allocated dmar_domain for the given guest
 * address width: compute the AGAW and allocate the top-level page
 * directory. Returns 0 on success, -ENOMEM on allocation failure.
 */
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	/* Conservative defaults; refined when an IOMMU is attached. */
	domain->iommu_coherency = false;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
4010 
/* Attaching to the blocking domain simply blocks all DMA from @dev. */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);
	return 0;
}
4017 
/* Singleton domain that blocks all DMA; shared by every device. */
static struct iommu_domain blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev	= blocking_domain_attach_dev,
	}
};
4024 
4025 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4026 {
4027 	struct dmar_domain *dmar_domain;
4028 	struct iommu_domain *domain;
4029 
4030 	switch (type) {
4031 	case IOMMU_DOMAIN_DMA:
4032 	case IOMMU_DOMAIN_UNMANAGED:
4033 		dmar_domain = alloc_domain(type);
4034 		if (!dmar_domain) {
4035 			pr_err("Can't allocate dmar_domain\n");
4036 			return NULL;
4037 		}
4038 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4039 			pr_err("Domain initialization failed\n");
4040 			domain_exit(dmar_domain);
4041 			return NULL;
4042 		}
4043 
4044 		domain = &dmar_domain->domain;
4045 		domain->geometry.aperture_start = 0;
4046 		domain->geometry.aperture_end   =
4047 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4048 		domain->geometry.force_aperture = true;
4049 
4050 		return domain;
4051 	case IOMMU_DOMAIN_IDENTITY:
4052 		return &si_domain->domain;
4053 	case IOMMU_DOMAIN_SVA:
4054 		return intel_svm_domain_alloc();
4055 	default:
4056 		return NULL;
4057 	}
4058 
4059 	return NULL;
4060 }
4061 
/*
 * iommufd user-domain allocation. With a @parent, only nested domains
 * are supported (no extra flags allowed). Otherwise allocate a normal
 * paging domain, optionally marked as a nesting parent and/or with
 * dirty tracking enabled. Returns an ERR_PTR on failure.
 */
static struct iommu_domain *
intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
			      struct iommu_domain *parent,
			      const struct iommu_user_data *user_data)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
	struct intel_iommu *iommu = info->iommu;
	struct iommu_domain *domain;

	/* Must be NESTING domain */
	if (parent) {
		if (!nested_supported(iommu) || flags)
			return ERR_PTR(-EOPNOTSUPP);
		return intel_nested_domain_alloc(parent, user_data);
	}

	/* Reject unknown flags and unsupported feature combinations. */
	if (flags &
	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
		return ERR_PTR(-EOPNOTSUPP);
	if (nested_parent && !nested_supported(iommu))
		return ERR_PTR(-EOPNOTSUPP);
	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * domain_alloc_user op needs to fully initialize a domain before
	 * return, so uses iommu_domain_alloc() here for simple.
	 */
	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	if (nested_parent)
		to_dmar_domain(domain)->nested_parent = true;

	if (dirty_tracking) {
		/* Dirty tracking is only implemented for second-level tables. */
		if (to_dmar_domain(domain)->use_first_level) {
			iommu_domain_free(domain);
			return ERR_PTR(-EOPNOTSUPP);
		}
		domain->dirty_ops = &intel_dirty_ops;
	}

	return domain;
}
4109 
/* Free a domain; the shared identity domain is never torn down here. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}
4115 
/*
 * Validate that @dev's IOMMU can host @domain and shrink the domain's
 * page-table depth to what the hardware supports. Returns 0 on
 * success, -ENODEV if no IOMMU is found, -EINVAL on capability or
 * address-width mismatch.
 */
int prepare_domain_attach_device(struct iommu_domain *domain,
				 struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
		return -EINVAL;

	if (domain->dirty_ops && !ssads_supported(iommu))
		return -EINVAL;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width))
		return -EINVAL;
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			/* Promote the first entry's target to be the new top. */
			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
4158 
/*
 * iommu_ops->attach_dev: detach @dev from any current domain, verify
 * compatibility, then attach it to @domain.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	int ret;

	/* A device can belong to only one domain at a time. */
	if (info->domain)
		device_block_translation(dev);

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
}
4174 
4175 static int intel_iommu_map(struct iommu_domain *domain,
4176 			   unsigned long iova, phys_addr_t hpa,
4177 			   size_t size, int iommu_prot, gfp_t gfp)
4178 {
4179 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4180 	u64 max_addr;
4181 	int prot = 0;
4182 
4183 	if (iommu_prot & IOMMU_READ)
4184 		prot |= DMA_PTE_READ;
4185 	if (iommu_prot & IOMMU_WRITE)
4186 		prot |= DMA_PTE_WRITE;
4187 	if (dmar_domain->set_pte_snp)
4188 		prot |= DMA_PTE_SNP;
4189 
4190 	max_addr = iova + size;
4191 	if (dmar_domain->max_addr < max_addr) {
4192 		u64 end;
4193 
4194 		/* check if minimum agaw is sufficient for mapped address */
4195 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4196 		if (end < max_addr) {
4197 			pr_err("%s: iommu width (%d) is not "
4198 			       "sufficient for the mapped address (%llx)\n",
4199 			       __func__, dmar_domain->gaw, max_addr);
4200 			return -EFAULT;
4201 		}
4202 		dmar_domain->max_addr = max_addr;
4203 	}
4204 	/* Round up size to next multiple of PAGE_SIZE, if it and
4205 	   the low bits of hpa would take us onto the next page */
4206 	size = aligned_nrpages(hpa, size);
4207 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4208 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4209 }
4210 
4211 static int intel_iommu_map_pages(struct iommu_domain *domain,
4212 				 unsigned long iova, phys_addr_t paddr,
4213 				 size_t pgsize, size_t pgcount,
4214 				 int prot, gfp_t gfp, size_t *mapped)
4215 {
4216 	unsigned long pgshift = __ffs(pgsize);
4217 	size_t size = pgcount << pgshift;
4218 	int ret;
4219 
4220 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4221 		return -EINVAL;
4222 
4223 	if (!IS_ALIGNED(iova | paddr, pgsize))
4224 		return -EINVAL;
4225 
4226 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4227 	if (!ret && mapped)
4228 		*mapped = size;
4229 
4230 	return ret;
4231 }
4232 
/*
 * Unmap [iova, iova + size); the range is widened to the containing
 * large page if needed. Freed page-table pages go onto the gather
 * freelist; IOTLB invalidation is deferred to the tlb_sync callback.
 * Returns the number of bytes actually unmapped (0 if not mapped).
 */
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start_pfn, last_pfn;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
				     &level, GFP_ATOMIC)))
		return 0;

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);

	/* Shrink the watermark if we just unmapped the top of the domain. */
	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	/*
	 * We do not use page-selective IOTLB invalidation in flush queue,
	 * so there is no need to track page and sync iotlb.
	 */
	if (!iommu_iotlb_gather_queued(gather))
		iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}
4267 
4268 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4269 				      unsigned long iova,
4270 				      size_t pgsize, size_t pgcount,
4271 				      struct iommu_iotlb_gather *gather)
4272 {
4273 	unsigned long pgshift = __ffs(pgsize);
4274 	size_t size = pgcount << pgshift;
4275 
4276 	return intel_iommu_unmap(domain, iova, size, gather);
4277 }
4278 
4279 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4280 				 struct iommu_iotlb_gather *gather)
4281 {
4282 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4283 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4284 	size_t size = gather->end - gather->start;
4285 	struct iommu_domain_info *info;
4286 	unsigned long start_pfn;
4287 	unsigned long nrpages;
4288 	unsigned long i;
4289 
4290 	nrpages = aligned_nrpages(gather->start, size);
4291 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4292 
4293 	xa_for_each(&dmar_domain->iommu_array, i, info)
4294 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4295 				      start_pfn, nrpages,
4296 				      list_empty(&gather->freelist), 0);
4297 
4298 	put_pages_list(&gather->freelist);
4299 }
4300 
4301 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4302 					    dma_addr_t iova)
4303 {
4304 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4305 	struct dma_pte *pte;
4306 	int level = 0;
4307 	u64 phys = 0;
4308 
4309 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4310 			     GFP_ATOMIC);
4311 	if (pte && dma_pte_present(pte))
4312 		phys = dma_pte_addr(pte) +
4313 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4314 						VTD_PAGE_SHIFT) - 1));
4315 
4316 	return phys;
4317 }
4318 
4319 static bool domain_support_force_snooping(struct dmar_domain *domain)
4320 {
4321 	struct device_domain_info *info;
4322 	bool support = true;
4323 
4324 	assert_spin_locked(&domain->lock);
4325 	list_for_each_entry(info, &domain->devices, link) {
4326 		if (!ecap_sc_support(info->iommu->ecap)) {
4327 			support = false;
4328 			break;
4329 		}
4330 	}
4331 
4332 	return support;
4333 }
4334 
4335 static void domain_set_force_snooping(struct dmar_domain *domain)
4336 {
4337 	struct device_domain_info *info;
4338 
4339 	assert_spin_locked(&domain->lock);
4340 	/*
4341 	 * Second level page table supports per-PTE snoop control. The
4342 	 * iommu_map() interface will handle this by setting SNP bit.
4343 	 */
4344 	if (!domain->use_first_level) {
4345 		domain->set_pte_snp = true;
4346 		return;
4347 	}
4348 
4349 	list_for_each_entry(info, &domain->devices, link)
4350 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4351 						     IOMMU_NO_PASID);
4352 }
4353 
4354 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4355 {
4356 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4357 	unsigned long flags;
4358 
4359 	if (dmar_domain->force_snooping)
4360 		return true;
4361 
4362 	spin_lock_irqsave(&dmar_domain->lock, flags);
4363 	if (!domain_support_force_snooping(dmar_domain)) {
4364 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4365 		return false;
4366 	}
4367 
4368 	domain_set_force_snooping(dmar_domain);
4369 	dmar_domain->force_snooping = true;
4370 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4371 
4372 	return true;
4373 }
4374 
4375 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4376 {
4377 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4378 
4379 	switch (cap) {
4380 	case IOMMU_CAP_CACHE_COHERENCY:
4381 	case IOMMU_CAP_DEFERRED_FLUSH:
4382 		return true;
4383 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4384 		return dmar_platform_optin();
4385 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4386 		return ecap_sc_support(info->iommu->ecap);
4387 	case IOMMU_CAP_DIRTY_TRACKING:
4388 		return ssads_supported(info->iommu);
4389 	default:
4390 		return false;
4391 	}
4392 }
4393 
/*
 * Probe @dev and set up its per-device IOMMU state.
 *
 * Looks up the DMAR unit that translates the device, allocates a
 * struct device_domain_info recording its addressing (segment/bus/devfn)
 * and capabilities (ATS, PASID, PRI), and attaches it to the device via
 * dev_iommu_priv_set(). For scalable-mode capable units a PASID table
 * is also allocated. Returns the iommu_device handle or an ERR_PTR().
 */
static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu || !iommu->iommu.ops)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	/*
	 * Sub-devices of a real DMA device are addressed with the physical
	 * function's own RID; everything else uses the RID reported by
	 * device_to_iommu().
	 */
	if (dev_is_real_dma_subdevice(dev)) {
		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	} else {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	}

	info->dev = dev;
	info->iommu = iommu;
	if (dev_is_pci(dev)) {
		/* ATS needs support from both the IOMMU and the device. */
		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_ats_supported(pdev, iommu)) {
			info->ats_supported = 1;
			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);

			/*
			 * For IOMMU that supports device IOTLB throttling
			 * (DIT), we assign PFSID to the invalidation desc
			 * of a VF such that IOMMU HW can gauge queue depth
			 * at PF level. If DIT is not set, PFSID will be
			 * treated as reserved, which should be set to 0.
			 */
			if (ecap_dit(iommu->ecap))
				info->pfsid = pci_dev_id(pci_physfn(pdev));
			info->ats_qdep = pci_ats_queue_depth(pdev);
		}
		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);

				/* Bit 0 marks PASID as usable at all. */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* Page requests additionally require device ATS. */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	dev_iommu_priv_set(dev, info);

	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dev_iommu_priv_set(dev, NULL);
			kfree(info);
			return ERR_PTR(ret);
		}
	}

	intel_iommu_debugfs_create_dev(info);

	return &iommu->iommu;
}
4470 
/*
 * Undo intel_iommu_probe_device(): detach the device from its domain,
 * free the PASID table and the per-device info, and clear the DMA ops
 * installed by intel_iommu_probe_finalize().
 */
static void intel_iommu_release_device(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	dmar_remove_one_dev_info(dev);
	intel_pasid_free_table(dev);
	intel_iommu_debugfs_remove_dev(info);
	dev_iommu_priv_set(dev, NULL);
	kfree(info);
	set_dma_ops(dev, NULL);
}
4482 
/* Drop any stale DMA ops and install the generic IOMMU DMA-API ops. */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4488 
/*
 * Report the reserved IOVA regions for @device onto @head:
 *  - RMRR ranges that target the device (or a bridge it sits behind),
 *    reported as direct-mapped regions;
 *  - optionally the low 16MB for ISA bridges (legacy floppy workaround);
 *  - the IOAPIC range, reported as an MSI region.
 */
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	/* RMRR device scopes are protected by RCU; no sleeping allocations. */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			/* GFP_ATOMIC because we are inside the RCU read side. */
			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type,
						       GFP_ATOMIC);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	rcu_read_unlock();

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
					IOMMU_RESV_DIRECT_RELAXABLE,
					GFP_KERNEL);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI, GFP_KERNEL);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
4547 
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	/* PCI devices get topology-aware grouping; others stand alone. */
	if (!dev_is_pci(dev))
		return generic_device_group(dev);

	return pci_device_group(dev);
}
4554 
/*
 * Validate that Shared Virtual Addressing can be used on @dev: the
 * translating IOMMU must be SVM-capable, and PASID and ATS must already
 * be enabled on the device. No state is changed here - this only checks
 * the prerequisites.
 */
static int intel_iommu_enable_sva(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;

	if (!info || dmar_disabled)
		return -EINVAL;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
		return -ENODEV;

	if (!info->pasid_enabled || !info->ats_enabled)
		return -EINVAL;

	/*
	 * Devices having device-specific I/O fault handling should not
	 * support PCI/PRI. The IOMMU side has no means to check the
	 * capability of device-specific IOPF.  Therefore, IOMMU can only
	 * default that if the device driver enables SVA on a non-PRI
	 * device, it will handle IOPF in its own way.
	 */
	if (!info->pri_supported)
		return 0;

	/* Devices supporting PRI should have it enabled. */
	if (!info->pri_enabled)
		return -EINVAL;

	return 0;
}
4589 
/*
 * Enable I/O page fault handling for @dev: reset and enable PRI on the
 * device, add it to the IOMMU's IOPF queue and register the fault
 * handler. Undone by intel_iommu_disable_iopf(). Returns 0 on success
 * or a negative errno; on failure all intermediate steps are rolled
 * back via the goto-cleanup labels below.
 */
static int intel_iommu_enable_iopf(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	int ret;

	/* PRI is a PCI concept and requires ATS to already be enabled. */
	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
		return -ENODEV;

	if (info->pri_enabled)
		return -EBUSY;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	/* PASID is required in PRG Response Message. */
	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
		return -EINVAL;

	ret = pci_reset_pri(pdev);
	if (ret)
		return ret;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
	if (ret)
		goto iopf_remove_device;

	ret = pci_enable_pri(pdev, PRQ_DEPTH);
	if (ret)
		goto iopf_unregister_handler;
	info->pri_enabled = 1;

	return 0;

iopf_unregister_handler:
	iommu_unregister_device_fault_handler(dev);
iopf_remove_device:
	iopf_queue_remove_device(iommu->iopf_queue, dev);

	return ret;
}
4637 
/*
 * Disable I/O page fault handling for @dev, reversing the steps of
 * intel_iommu_enable_iopf() in the opposite order.
 */
static int intel_iommu_disable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;

	if (!info->pri_enabled)
		return -EINVAL;

	/*
	 * PCIe spec states that by clearing PRI enable bit, the Page
	 * Request Interface will not issue new page requests, but has
	 * outstanding page requests that have been transmitted or are
	 * queued for transmission. This is supposed to be called after
	 * the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
	 */
	pci_disable_pri(to_pci_dev(dev));
	info->pri_enabled = 0;

	/*
	 * With PRI disabled and outstanding PRQs drained, unregistering
	 * fault handler and removing device from iopf queue should never
	 * fail.
	 */
	WARN_ON(iommu_unregister_device_fault_handler(dev));
	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));

	return 0;
}
4667 
4668 static int
4669 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4670 {
4671 	switch (feat) {
4672 	case IOMMU_DEV_FEAT_IOPF:
4673 		return intel_iommu_enable_iopf(dev);
4674 
4675 	case IOMMU_DEV_FEAT_SVA:
4676 		return intel_iommu_enable_sva(dev);
4677 
4678 	default:
4679 		return -ENODEV;
4680 	}
4681 }
4682 
4683 static int
4684 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4685 {
4686 	switch (feat) {
4687 	case IOMMU_DEV_FEAT_IOPF:
4688 		return intel_iommu_disable_iopf(dev);
4689 
4690 	case IOMMU_DEV_FEAT_SVA:
4691 		return 0;
4692 
4693 	default:
4694 		return -ENODEV;
4695 	}
4696 }
4697 
4698 static bool intel_iommu_is_attach_deferred(struct device *dev)
4699 {
4700 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4701 
4702 	return translation_pre_enabled(info->iommu) && !info->domain;
4703 }
4704 
4705 /*
4706  * Check that the device does not live on an external facing PCI port that is
4707  * marked as untrusted. Such devices should not be able to apply quirks and
4708  * thus not be able to bypass the IOMMU restrictions.
4709  */
4710 static bool risky_device(struct pci_dev *pdev)
4711 {
4712 	if (pdev->untrusted) {
4713 		pci_info(pdev,
4714 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4715 			 pdev->vendor, pdev->device);
4716 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4717 		return true;
4718 	}
4719 	return false;
4720 }
4721 
4722 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4723 				      unsigned long iova, size_t size)
4724 {
4725 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4726 	unsigned long pages = aligned_nrpages(iova, size);
4727 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4728 	struct iommu_domain_info *info;
4729 	unsigned long i;
4730 
4731 	xa_for_each(&dmar_domain->iommu_array, i, info)
4732 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4733 	return 0;
4734 }
4735 
/*
 * Detach whatever domain is bound to @pasid of @dev and tear down the
 * PASID entry. SVA domains manage their own state (e.g. mm notifiers)
 * and are handed off to the SVM code; for other domains the dev_pasid
 * tracking structure is unlinked from the domain before the PASID
 * entry is torn down and the page request queue drained.
 */
static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct dev_pasid_info *curr, *dev_pasid = NULL;
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	unsigned long flags;

	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
	if (WARN_ON_ONCE(!domain))
		goto out_tear_down;

	/*
	 * The SVA implementation needs to handle its own stuffs like the mm
	 * notification. Before consolidating that code into iommu core, let
	 * the intel sva code handle it.
	 */
	if (domain->type == IOMMU_DOMAIN_SVA) {
		intel_svm_remove_dev_pasid(dev, pasid);
		goto out_tear_down;
	}

	dmar_domain = to_dmar_domain(domain);
	spin_lock_irqsave(&dmar_domain->lock, flags);
	/* Find and unlink the tracking entry for this (dev, pasid) pair. */
	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
		if (curr->dev == dev && curr->pasid == pasid) {
			list_del(&curr->link_domain);
			dev_pasid = curr;
			break;
		}
	}
	/* set_dev_pasid() added it, so it must be on the list. */
	WARN_ON_ONCE(!dev_pasid);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	domain_detach_iommu(dmar_domain, iommu);
	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
	kfree(dev_pasid);
out_tear_down:
	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
	intel_drain_pasid_prq(dev, pasid);
}
4777 
/*
 * Attach @domain to @pasid of @dev. Picks the PASID entry setup routine
 * that matches the domain type (pass-through, first-level or
 * second-level translation) and records the binding on the domain's
 * dev_pasids list so it can be torn down later by
 * intel_iommu_remove_dev_pasid(). Returns 0 or a negative errno.
 */
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
				     struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;
	int ret;

	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
		return -EOPNOTSUPP;

	/* Dirty tracking is not supported on a PASID-attached domain here. */
	if (domain->dirty_ops)
		return -EINVAL;

	/* Context copied from a previous kernel (kdump) cannot be changed. */
	if (context_copied(iommu, info->bus, info->devfn))
		return -EBUSY;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
	if (!dev_pasid)
		return -ENOMEM;

	ret = domain_attach_iommu(dmar_domain, iommu);
	if (ret)
		goto out_free;

	/* Program the PASID entry according to the domain's translation type. */
	if (domain_type_is_si(dmar_domain))
		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
						     dev, pasid);
	else if (dmar_domain->use_first_level)
		ret = domain_setup_first_level(iommu, dmar_domain,
					       dev, pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
						     dev, pasid);
	if (ret)
		goto out_detach_iommu;

	dev_pasid->dev = dev;
	dev_pasid->pasid = pasid;
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	if (domain->type & __IOMMU_DOMAIN_PAGING)
		intel_iommu_debugfs_create_dev_pasid(dev_pasid);

	return 0;
out_detach_iommu:
	domain_detach_iommu(dmar_domain, iommu);
out_free:
	kfree(dev_pasid);
	return ret;
}
4837 
4838 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4839 {
4840 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4841 	struct intel_iommu *iommu = info->iommu;
4842 	struct iommu_hw_info_vtd *vtd;
4843 
4844 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4845 	if (!vtd)
4846 		return ERR_PTR(-ENOMEM);
4847 
4848 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4849 	vtd->cap_reg = iommu->cap;
4850 	vtd->ecap_reg = iommu->ecap;
4851 	*length = sizeof(*vtd);
4852 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4853 	return vtd;
4854 }
4855 
4856 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4857 					  bool enable)
4858 {
4859 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4860 	struct device_domain_info *info;
4861 	int ret;
4862 
4863 	spin_lock(&dmar_domain->lock);
4864 	if (dmar_domain->dirty_tracking == enable)
4865 		goto out_unlock;
4866 
4867 	list_for_each_entry(info, &dmar_domain->devices, link) {
4868 		ret = intel_pasid_setup_dirty_tracking(info->iommu,
4869 						       info->domain, info->dev,
4870 						       IOMMU_NO_PASID, enable);
4871 		if (ret)
4872 			goto err_unwind;
4873 	}
4874 
4875 	dmar_domain->dirty_tracking = enable;
4876 out_unlock:
4877 	spin_unlock(&dmar_domain->lock);
4878 
4879 	return 0;
4880 
4881 err_unwind:
4882 	list_for_each_entry(info, &dmar_domain->devices, link)
4883 		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4884 						 info->dev, IOMMU_NO_PASID,
4885 						 dmar_domain->dirty_tracking);
4886 	spin_unlock(&dmar_domain->lock);
4887 	return ret;
4888 }
4889 
/*
 * Walk the page table for [iova, iova + size) and record pages whose
 * second-level dirty bit is set into @dirty; whether the bit is also
 * cleared is controlled by @flags (see dma_sl_pte_test_and_clear_dirty()).
 * The walk advances by the page size of each PTE's level, so huge pages
 * are handled in a single step.
 */
static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
					    unsigned long iova, size_t size,
					    unsigned long flags,
					    struct iommu_dirty_bitmap *dirty)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long end = iova + size - 1;
	unsigned long pgsize;

	/*
	 * IOMMUFD core calls into a dirty tracking disabled domain without an
	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
	 * have occurred when we stopped dirty tracking. This ensures that we
	 * never inherit dirtied bits from a previous cycle.
	 */
	if (!dmar_domain->dirty_tracking && dirty->bitmap)
		return -EINVAL;

	do {
		struct dma_pte *pte;
		int lvl = 0;

		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
				     GFP_ATOMIC);
		/* Step over unmapped ranges at this level's granularity. */
		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
		if (!pte || !dma_pte_present(pte)) {
			iova += pgsize;
			continue;
		}

		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}
4927 
/* Dirty-page tracking callbacks, installed on dirty-tracking capable domains. */
const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4932 
/* IOMMU core callbacks for the Intel VT-d driver. */
const struct iommu_ops intel_iommu_ops = {
	.blocked_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4966 
/*
 * PCI fixup: disable DMA remapping for the integrated graphics device on
 * chipsets where it is known broken (unless the device is on an untrusted
 * link, in which case the quirk is skipped).
 */
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5010 
/*
 * PCI fixup: force the write-buffer-flush workaround on chipsets that
 * need it but fail to advertise the RWBF capability.
 */
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5031 
/* Graphics control config register and its memory/VT enable fields. */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

/*
 * PCI fixup: if the BIOS left no VT-enabled GTT space, graphics cannot be
 * remapped at all; otherwise batched (lazy) IOTLB flushing must be
 * disabled on these (Ironlake-era) parts.
 */
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	/* Register may be inaccessible; bail out silently in that case. */
	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5065 
/*
 * PCI fixup: on certain integrated graphics generations (matched by the
 * high byte of the device ID below), keep translation enabled instead of
 * clearing the TE bit when the IOMMU is torn down.
 */
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	/* Device-ID high byte identifies the graphics generation. */
	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5086 
5087 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5088    ISOCH DMAR unit for the Azalia sound device, but not give it any
5089    TLB entries, which causes it to deadlock. Check for that.  We do
5090    this in a function called from init_dmars(), instead of in a PCI
5091    quirk, because we don't want to print the obnoxious "BIOS broken"
5092    message if VT-d is actually disabled.
5093 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	/* Read the isoch control register; offset 0x188 per chipset docs
	   (NOTE(review): offset not re-verified here). */
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		/* Fall back to identity-mapping the Azalia device. */
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
5155 
5156 /*
5157  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5158  * invalidation completion before posted writes initiated with translated address
5159  * that utilized translations matching the invalidation address range, violating
5160  * the invalidation completion ordering.
5161  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5162  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5163  * under the control of the trusted/privileged host device driver must use this
5164  * quirk.
5165  * Device TLBs are invalidated under the following six conditions:
5166  * 1. Device driver does DMA API unmap IOVA
5167  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5168  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5169  *    exit_mmap() due to crash
5170  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5171  *    VM has to free pages that were unmapped
5172  * 5. Userspace driver unmaps a DMA buffer
5173  * 6. Cache invalidation in vSVA usage (upcoming)
5174  *
5175  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5176  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5177  * invalidate TLB the same way as normal user unmap which will use this quirk.
5178  * The dTLB invalidation after PASID cache flush does not need this quirk.
5179  *
5180  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5181  */
5182 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5183 			       unsigned long address, unsigned long mask,
5184 			       u32 pasid, u16 qdep)
5185 {
5186 	u16 sid;
5187 
5188 	if (likely(!info->dtlb_extra_inval))
5189 		return;
5190 
5191 	sid = PCI_DEVID(info->bus, info->devfn);
5192 	if (pasid == IOMMU_NO_PASID) {
5193 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5194 				   qdep, address, mask);
5195 	} else {
5196 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5197 					 pasid, qdep, address, mask);
5198 	}
5199 }
5200 
5201 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5202 
5203 /*
5204  * Function to submit a command to the enhanced command interface. The
5205  * valid enhanced command descriptions are defined in Table 47 of the
5206  * VT-d spec. The VT-d hardware implementation may support some but not
5207  * all commands, which can be determined by checking the Enhanced
5208  * Command Capability Register.
5209  *
5210  * Return values:
5211  *  - 0: Command successful without any error;
5212  *  - Negative: software error value;
5213  *  - Nonzero positive: failure status code defined in Table 48.
5214  */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	/* Bail out if the hardware doesn't advertise enhanced command support. */
	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	/* Serialize all register access on this IOMMU; interrupts disabled. */
	raw_spin_lock_irqsave(&iommu->register_lock, flags);

	/*
	 * The interface takes one command at a time: if the response
	 * register still has its in-progress bit set, a previously
	 * submitted command has not completed yet.
	 */
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	/* Writing ECMD (command + operand A) triggers execution in hardware. */
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

	/* Spin until the in-progress bit clears; result is left in 'res'. */
	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);

	/*
	 * NOTE(review): reaching here with IP still set presumably means
	 * IOMMU_WAIT_OP gave up waiting — confirm against the macro's
	 * timeout behavior.
	 */
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	/* 0 on success, otherwise the hardware status code (Table 48). */
	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

	return ret;
}
5256