xref: /linux/drivers/iommu/intel/iommu.c (revision 4fd18fc38757217c746aa063ba9e4729814dc737)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/*
 * Message prefix for this file: pr_fmt() prepends "DMAR: " to the format
 * string, and dev_fmt() reuses the same prefix.
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-map-ops.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49 
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52 
/* The root table and each context table are sized as one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE
55 
/*
 * PCI device classification helpers.
 *
 * @pdev is any expression yielding a pointer with ->class, ->vendor and
 * ->device members. The argument is parenthesized in every expansion so
 * that expressions such as "p + 1" or "cond ? a : b" bind correctly.
 *
 * IS_GFX_DEVICE compares the base class (class >> 16); IS_USB_DEVICE and
 * IS_ISA_DEVICE compare class/subclass (class >> 8); IS_AZALIA matches
 * vendor 0x8086, device 0x3a3e.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60 
/* Bounds of the address window labelled as the IOAPIC range. */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Address width, in bits, that iommu_calculate_agaw() asks for by default. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Upper bound on address width, and the same bound expressed in PFN bits. */
#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/* Largest PFN / largest address representable with a @gaw-bit address width. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

/* Convert an address into a page frame number in units of PAGE_SIZE. */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
/* Each page-table level indexes LEVEL_STRIDE bits of the PFN. */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
87 
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 *
 * The value below has every bit from bit 12 upward set.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
105 
/* Translate an AGAW value into a page-table level count (agaw + 2). */
static inline int agaw_to_level(int agaw)
{
	const int level = 2 + agaw;

	return level;
}
110 
111 static inline int agaw_to_width(int agaw)
112 {
113 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115 
116 static inline int width_to_agaw(int width)
117 {
118 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120 
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 	return (level - 1) * LEVEL_STRIDE;
124 }
125 
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130 
131 static inline u64 level_mask(int level)
132 {
133 	return -1ULL << level_to_offset_bits(level);
134 }
135 
136 static inline u64 level_size(int level)
137 {
138 	return 1ULL << level_to_offset_bits(level);
139 }
140 
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143 	return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145 
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150 
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
/* VT-d page frame number of the memory described by @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
170 
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by the "tboot_noforce" option of the intel_iommu= boot parameter. */
static int intel_iommu_tboot_noforce;
/* Set by the "off" option of the intel_iommu= boot parameter. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 	if (!(re->lo & 1))
194 		return 0;
195 
196 	return re->lo & VTD_PAGE_MASK;
197 }
198 
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 	if (!(re->hi & 1))
206 		return 0;
207 
208 	return re->hi & VTD_PAGE_MASK;
209 }
210 
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 	context->lo &= ~(1ULL << 11);
214 }
215 
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 	return !!(context->lo & (1ULL << 11));
219 }
220 
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 	context->hi |= (1ull << 3);
224 }
225 
226 static inline bool context_copied(struct context_entry *context)
227 {
228 	return !!(context->hi & (1ULL << 3));
229 }
230 
231 static inline bool __context_present(struct context_entry *context)
232 {
233 	return (context->lo & 1);
234 }
235 
236 bool context_present(struct context_entry *context)
237 {
238 	return context_pasid_enabled(context) ?
239 	     __context_present(context) :
240 	     __context_present(context) && !context_copied(context);
241 }
242 
243 static inline void context_set_present(struct context_entry *context)
244 {
245 	context->lo |= 1;
246 }
247 
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 	context->lo &= (((u64)-1) << 2) | 1;
251 }
252 
253 static inline void context_set_translation_type(struct context_entry *context,
254 						unsigned long value)
255 {
256 	context->lo &= (((u64)-1) << 4) | 3;
257 	context->lo |= (value & 3) << 2;
258 }
259 
260 static inline void context_set_address_root(struct context_entry *context,
261 					    unsigned long value)
262 {
263 	context->lo &= ~VTD_PAGE_MASK;
264 	context->lo |= value & VTD_PAGE_MASK;
265 }
266 
267 static inline void context_set_address_width(struct context_entry *context,
268 					     unsigned long value)
269 {
270 	context->hi |= value & 7;
271 }
272 
273 static inline void context_set_domain_id(struct context_entry *context,
274 					 unsigned long value)
275 {
276 	context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278 
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 	return((c->hi >> 8) & 0xffff);
282 }
283 
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 	context->lo = 0;
287 	context->hi = 0;
288 }
289 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
298 
/*
 * Iterate @idx over every index below g_num_of_iommus for which @domain
 * holds a non-zero iommu_refcnt[@idx]. The statement that follows the
 * macro is the loop body.
 *
 * Both arguments are parenthesized so that pointer expressions such as
 * "p + 1" can be passed as @domain. The expansion ends in a bare "if",
 * so do not follow the body with an "else".
 */
#define for_each_domain_iommu(idx, domain)			\
	for ((idx) = 0; (idx) < g_num_of_iommus; (idx)++)	\
		if ((domain)->iommu_refcnt[(idx)])
302 
/*
 * One RMRR unit: a reserved address range, the ACPI header it came from,
 * and the devices it targets. Instances are linked through @list (see
 * dmar_rmrr_units).
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
311 
/*
 * One ATSR unit: the ACPI header it came from and the devices it
 * targets. Instances are linked through @list (see dmar_atsr_units).
 */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
319 
/* Global lists of the ATSR and RMRR units defined above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

/* Walk every entry of dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/*
 * Number of iommu slots: the upper bound used when indexing g_iommus[]
 * and a domain's iommu_refcnt[].
 */
static int g_num_of_iommus;

/* Forward declarations for functions defined later in this file. */
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);
337 
/*
 * Non-zero when the IOMMU is disabled. The build-time default can be
 * overridden with intel_iommu=on / intel_iommu=off.
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

/* Scalable-mode switch; intel_iommu=sm_on sets it to 1. */
#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;		/* cleared by intel_iommu=igfx_off */
static int dmar_forcedac;		/* set by intel_iommu=forcedac */
static int intel_iommu_strict;		/* set by intel_iommu=strict */
static int intel_iommu_superpage = 1;	/* cleared by intel_iommu=sp_off */
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

/* Flag values; NOTE(review): presumably bits of iommu_identity_mapping - confirm. */
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365 
366 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
367 struct device_domain_info *get_domain_info(struct device *dev)
368 {
369 	struct device_domain_info *info;
370 
371 	if (!dev)
372 		return NULL;
373 
374 	info = dev_iommu_priv_get(dev);
375 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
376 		return NULL;
377 
378 	return info;
379 }
380 
/*
 * device_domain_list is the global list of device_domain_info entries;
 * for_each_device_domain() walks it with device_domain_lock held.
 */
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
383 
384 /*
385  * Iterate over elements in device_domain_list and call the specified
386  * callback @fn against each element.
387  */
388 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
389 				     void *data), void *data)
390 {
391 	int ret = 0;
392 	unsigned long flags;
393 	struct device_domain_info *info;
394 
395 	spin_lock_irqsave(&device_domain_lock, flags);
396 	list_for_each_entry(info, &device_domain_list, global) {
397 		ret = fn(info, data);
398 		if (ret) {
399 			spin_unlock_irqrestore(&device_domain_lock, flags);
400 			return ret;
401 		}
402 	}
403 	spin_unlock_irqrestore(&device_domain_lock, flags);
404 
405 	return 0;
406 }
407 
/* Declared here without an initializer; the full definition is not in this part of the file. */
const struct iommu_ops intel_iommu_ops;
409 
410 static bool translation_pre_enabled(struct intel_iommu *iommu)
411 {
412 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
413 }
414 
415 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
416 {
417 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
418 }
419 
420 static void init_translation_status(struct intel_iommu *iommu)
421 {
422 	u32 gsts;
423 
424 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
425 	if (gsts & DMA_GSTS_TES)
426 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428 
429 static int __init intel_iommu_setup(char *str)
430 {
431 	if (!str)
432 		return -EINVAL;
433 	while (*str) {
434 		if (!strncmp(str, "on", 2)) {
435 			dmar_disabled = 0;
436 			pr_info("IOMMU enabled\n");
437 		} else if (!strncmp(str, "off", 3)) {
438 			dmar_disabled = 1;
439 			no_platform_optin = 1;
440 			pr_info("IOMMU disabled\n");
441 		} else if (!strncmp(str, "igfx_off", 8)) {
442 			dmar_map_gfx = 0;
443 			pr_info("Disable GFX device mapping\n");
444 		} else if (!strncmp(str, "forcedac", 8)) {
445 			pr_info("Forcing DAC for PCI devices\n");
446 			dmar_forcedac = 1;
447 		} else if (!strncmp(str, "strict", 6)) {
448 			pr_info("Disable batched IOTLB flush\n");
449 			intel_iommu_strict = 1;
450 		} else if (!strncmp(str, "sp_off", 6)) {
451 			pr_info("Disable supported super page\n");
452 			intel_iommu_superpage = 0;
453 		} else if (!strncmp(str, "sm_on", 5)) {
454 			pr_info("Intel-IOMMU: scalable mode supported\n");
455 			intel_iommu_sm = 1;
456 		} else if (!strncmp(str, "tboot_noforce", 13)) {
457 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
458 			intel_iommu_tboot_noforce = 1;
459 		}
460 
461 		str += strcspn(str, ",");
462 		while (*str == ',')
463 			str++;
464 	}
465 	return 0;
466 }
467 __setup("intel_iommu=", intel_iommu_setup);
468 
/* Slab caches backing alloc_domain_mem() and alloc_devinfo_mem(). */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
471 
472 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 {
474 	struct dmar_domain **domains;
475 	int idx = did >> 8;
476 
477 	domains = iommu->domains[idx];
478 	if (!domains)
479 		return NULL;
480 
481 	return domains[did & 0xff];
482 }
483 
484 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
485 			     struct dmar_domain *domain)
486 {
487 	struct dmar_domain **domains;
488 	int idx = did >> 8;
489 
490 	if (!iommu->domains[idx]) {
491 		size_t size = 256 * sizeof(struct dmar_domain *);
492 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
493 	}
494 
495 	domains = iommu->domains[idx];
496 	if (WARN_ON(!domains))
497 		return;
498 	else
499 		domains[did & 0xff] = domain;
500 }
501 
502 void *alloc_pgtable_page(int node)
503 {
504 	struct page *page;
505 	void *vaddr = NULL;
506 
507 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 	if (page)
509 		vaddr = page_address(page);
510 	return vaddr;
511 }
512 
/* Release a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
517 
518 static inline void *alloc_domain_mem(void)
519 {
520 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
521 }
522 
523 static void free_domain_mem(void *vaddr)
524 {
525 	kmem_cache_free(iommu_domain_cache, vaddr);
526 }
527 
528 static inline void * alloc_devinfo_mem(void)
529 {
530 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
531 }
532 
533 static inline void free_devinfo_mem(void *vaddr)
534 {
535 	kmem_cache_free(iommu_devinfo_cache, vaddr);
536 }
537 
538 static inline int domain_type_is_si(struct dmar_domain *domain)
539 {
540 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
541 }
542 
543 static inline bool domain_use_first_level(struct dmar_domain *domain)
544 {
545 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
546 }
547 
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
549 				       unsigned long pfn)
550 {
551 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
552 
553 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
554 }
555 
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 {
558 	unsigned long sagaw;
559 	int agaw = -1;
560 
561 	sagaw = cap_sagaw(iommu->cap);
562 	for (agaw = width_to_agaw(max_gaw);
563 	     agaw >= 0; agaw--) {
564 		if (test_bit(agaw, &sagaw))
565 			break;
566 	}
567 
568 	return agaw;
569 }
570 
571 /*
572  * Calculate max SAGAW for each iommu.
573  */
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
575 {
576 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
577 }
578 
579 /*
580  * calculate agaw for each iommu.
581  * "SAGAW" may be different across iommus, use a default agaw, and
582  * get a supported less agaw for iommus that don't support the default agaw.
583  */
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
585 {
586 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
587 }
588 
589 /* This functionin only returns single iommu in a domain */
590 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
591 {
592 	int iommu_id;
593 
594 	/* si_domain and vm domain should not get here. */
595 	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
596 		return NULL;
597 
598 	for_each_domain_iommu(iommu_id, domain)
599 		break;
600 
601 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
602 		return NULL;
603 
604 	return g_iommus[iommu_id];
605 }
606 
607 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
608 {
609 	return sm_supported(iommu) ?
610 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
611 }
612 
613 static void domain_update_iommu_coherency(struct dmar_domain *domain)
614 {
615 	struct dmar_drhd_unit *drhd;
616 	struct intel_iommu *iommu;
617 	bool found = false;
618 	int i;
619 
620 	domain->iommu_coherency = 1;
621 
622 	for_each_domain_iommu(i, domain) {
623 		found = true;
624 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
625 			domain->iommu_coherency = 0;
626 			break;
627 		}
628 	}
629 	if (found)
630 		return;
631 
632 	/* No hardware attached; use lowest common denominator */
633 	rcu_read_lock();
634 	for_each_active_iommu(iommu, drhd) {
635 		if (!iommu_paging_structure_coherency(iommu)) {
636 			domain->iommu_coherency = 0;
637 			break;
638 		}
639 	}
640 	rcu_read_unlock();
641 }
642 
643 static int domain_update_iommu_snooping(struct intel_iommu *skip)
644 {
645 	struct dmar_drhd_unit *drhd;
646 	struct intel_iommu *iommu;
647 	int ret = 1;
648 
649 	rcu_read_lock();
650 	for_each_active_iommu(iommu, drhd) {
651 		if (iommu != skip) {
652 			if (!ecap_sc_support(iommu->ecap)) {
653 				ret = 0;
654 				break;
655 			}
656 		}
657 	}
658 	rcu_read_unlock();
659 
660 	return ret;
661 }
662 
663 static int domain_update_iommu_superpage(struct dmar_domain *domain,
664 					 struct intel_iommu *skip)
665 {
666 	struct dmar_drhd_unit *drhd;
667 	struct intel_iommu *iommu;
668 	int mask = 0x3;
669 
670 	if (!intel_iommu_superpage) {
671 		return 0;
672 	}
673 
674 	/* set iommu_superpage to the smallest common denominator */
675 	rcu_read_lock();
676 	for_each_active_iommu(iommu, drhd) {
677 		if (iommu != skip) {
678 			if (domain && domain_use_first_level(domain)) {
679 				if (!cap_fl1gp_support(iommu->cap))
680 					mask = 0x1;
681 			} else {
682 				mask &= cap_super_page_val(iommu->cap);
683 			}
684 
685 			if (!mask)
686 				break;
687 		}
688 	}
689 	rcu_read_unlock();
690 
691 	return fls(mask);
692 }
693 
694 static int domain_update_device_node(struct dmar_domain *domain)
695 {
696 	struct device_domain_info *info;
697 	int nid = NUMA_NO_NODE;
698 
699 	assert_spin_locked(&device_domain_lock);
700 
701 	if (list_empty(&domain->devices))
702 		return NUMA_NO_NODE;
703 
704 	list_for_each_entry(info, &domain->devices, link) {
705 		if (!info->dev)
706 			continue;
707 
708 		/*
709 		 * There could possibly be multiple device numa nodes as devices
710 		 * within the same domain may sit behind different IOMMUs. There
711 		 * isn't perfect answer in such situation, so we select first
712 		 * come first served policy.
713 		 */
714 		nid = dev_to_node(info->dev);
715 		if (nid != NUMA_NO_NODE)
716 			break;
717 	}
718 
719 	return nid;
720 }
721 
722 /* Some capabilities may be different across iommus */
723 static void domain_update_iommu_cap(struct dmar_domain *domain)
724 {
725 	domain_update_iommu_coherency(domain);
726 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
727 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
728 
729 	/*
730 	 * If RHSA is missing, we should default to the device numa domain
731 	 * as fall back.
732 	 */
733 	if (domain->nid == NUMA_NO_NODE)
734 		domain->nid = domain_update_device_node(domain);
735 
736 	/*
737 	 * First-level translation restricts the input-address to a
738 	 * canonical address (i.e., address bits 63:N have the same
739 	 * value as address bit [N-1], where N is 48-bits with 4-level
740 	 * paging and 57-bits with 5-level paging). Hence, skip bit
741 	 * [N-1].
742 	 */
743 	if (domain_use_first_level(domain))
744 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
745 	else
746 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
747 }
748 
749 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
750 					 u8 devfn, int alloc)
751 {
752 	struct root_entry *root = &iommu->root_entry[bus];
753 	struct context_entry *context;
754 	u64 *entry;
755 
756 	entry = &root->lo;
757 	if (sm_supported(iommu)) {
758 		if (devfn >= 0x80) {
759 			devfn -= 0x80;
760 			entry = &root->hi;
761 		}
762 		devfn *= 2;
763 	}
764 	if (*entry & 1)
765 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
766 	else {
767 		unsigned long phy_addr;
768 		if (!alloc)
769 			return NULL;
770 
771 		context = alloc_pgtable_page(iommu->node);
772 		if (!context)
773 			return NULL;
774 
775 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
776 		phy_addr = virt_to_phys((void *)context);
777 		*entry = phy_addr | 1;
778 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
779 	}
780 	return &context[devfn];
781 }
782 
783 static bool attach_deferred(struct device *dev)
784 {
785 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
786 }
787 
788 /**
789  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
790  *				 sub-hierarchy of a candidate PCI-PCI bridge
791  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
792  * @bridge: the candidate PCI-PCI bridge
793  *
794  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
795  */
796 static bool
797 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
798 {
799 	struct pci_dev *pdev, *pbridge;
800 
801 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
802 		return false;
803 
804 	pdev = to_pci_dev(dev);
805 	pbridge = to_pci_dev(bridge);
806 
807 	if (pbridge->subordinate &&
808 	    pbridge->subordinate->number <= pdev->bus->number &&
809 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
810 		return true;
811 
812 	return false;
813 }
814 
815 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
816 {
817 	struct dmar_drhd_unit *drhd;
818 	u32 vtbar;
819 	int rc;
820 
821 	/* We know that this device on this chipset has its own IOMMU.
822 	 * If we find it under a different IOMMU, then the BIOS is lying
823 	 * to us. Hope that the IOMMU for this device is actually
824 	 * disabled, and it needs no translation...
825 	 */
826 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
827 	if (rc) {
828 		/* "can't" happen */
829 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
830 		return false;
831 	}
832 	vtbar &= 0xffff0000;
833 
834 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
835 	drhd = dmar_find_matched_drhd_unit(pdev);
836 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
837 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
838 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
839 		return true;
840 	}
841 
842 	return false;
843 }
844 
845 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
846 {
847 	if (!iommu || iommu->drhd->ignored)
848 		return true;
849 
850 	if (dev_is_pci(dev)) {
851 		struct pci_dev *pdev = to_pci_dev(dev);
852 
853 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
854 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
855 		    quirk_ioat_snb_local_iommu(pdev))
856 			return true;
857 	}
858 
859 	return false;
860 }
861 
/*
 * Find the iommu that covers @dev and, optionally, the bus/devfn to use
 * for it.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   if both @bus and @devfn are non-NULL, receives the bus number.
 * @devfn: if both @bus and @devfn are non-NULL, receives the devfn.
 *
 * For PCI devices the lookup uses the physical function of the real DMA
 * device and is restricted to DRHD units in the same PCI segment. For
 * non-PCI devices with an ACPI companion, the companion device is used.
 *
 * A device matches a DRHD unit when it is listed in the unit's device
 * scope, when it sits below a bridge listed there, or when it is a PCI
 * device and the unit is marked include_all. The bus/devfn come from the
 * scope entry for an exact match, except for VFs; all other cases use
 * the PCI device's own numbers.
 *
 * Returns NULL when nothing matches or when iommu_is_dummy() rejects
 * the result. The walk runs under rcu_read_lock().
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* PCI devices only match units in their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		/* Also reached by goto from the scope loop above. */
		got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ended without a match. */
	iommu = NULL;
 out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
931 
932 static void domain_flush_cache(struct dmar_domain *domain,
933 			       void *addr, int size)
934 {
935 	if (!domain->iommu_coherency)
936 		clflush_cache_range(addr, size);
937 }
938 
939 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
940 {
941 	struct context_entry *context;
942 	int ret = 0;
943 	unsigned long flags;
944 
945 	spin_lock_irqsave(&iommu->lock, flags);
946 	context = iommu_context_addr(iommu, bus, devfn, 0);
947 	if (context)
948 		ret = context_present(context);
949 	spin_unlock_irqrestore(&iommu->lock, flags);
950 	return ret;
951 }
952 
953 static void free_context_table(struct intel_iommu *iommu)
954 {
955 	int i;
956 	unsigned long flags;
957 	struct context_entry *context;
958 
959 	spin_lock_irqsave(&iommu->lock, flags);
960 	if (!iommu->root_entry) {
961 		goto out;
962 	}
963 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
964 		context = iommu_context_addr(iommu, i, 0, 0);
965 		if (context)
966 			free_pgtable_page(context);
967 
968 		if (!sm_supported(iommu))
969 			continue;
970 
971 		context = iommu_context_addr(iommu, i, 0x80, 0);
972 		if (context)
973 			free_pgtable_page(context);
974 
975 	}
976 	free_pgtable_page(iommu->root_entry);
977 	iommu->root_entry = NULL;
978 out:
979 	spin_unlock_irqrestore(&iommu->lock, flags);
980 }
981 
/*
 * Walk @domain's page table from the top level down and return the PTE
 * that covers @pfn.
 *
 * @target_level is both input and output:
 *  - If *target_level is 0, the walk is a pure lookup. It stops at the
 *    first superpage or non-present entry, or at level 1, without
 *    allocating, and stores the level reached in *target_level.
 *  - Otherwise the walk stops at *target_level and allocates any missing
 *    intermediate tables on the way down.
 *
 * Returns NULL when @pfn is beyond the domain's address width or when a
 * table page cannot be allocated. domain->pgd must be set.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup mode: stop at the first leaf or hole. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			/* Flush the zeroed table before it becomes reachable. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
			/* Install only if the slot is still empty. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	/* Lookup mode: tell the caller which level was reached. */
	if (!*target_level)
		*target_level = level;

	return pte;
}
1037 
1038 /* return address's pte at specific level */
1039 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1040 					 unsigned long pfn,
1041 					 int level, int *large_page)
1042 {
1043 	struct dma_pte *parent, *pte;
1044 	int total = agaw_to_level(domain->agaw);
1045 	int offset;
1046 
1047 	parent = domain->pgd;
1048 	while (level <= total) {
1049 		offset = pfn_level_offset(pfn, total);
1050 		pte = &parent[offset];
1051 		if (level == total)
1052 			return pte;
1053 
1054 		if (!dma_pte_present(pte)) {
1055 			*large_page = total;
1056 			break;
1057 		}
1058 
1059 		if (dma_pte_superpage(pte)) {
1060 			*large_page = total;
1061 			return pte;
1062 		}
1063 
1064 		parent = phys_to_virt(dma_pte_addr(pte));
1065 		total--;
1066 	}
1067 	return NULL;
1068 }
1069 
/*
 * Clear the leaf PTEs that map [@start_pfn, @last_pfn] in @domain.
 * Page-table pages are not freed here. The caller must follow up with a
 * TLB flush.
 *
 * Both pfns must be within the domain's address width and @start_pfn
 * must not exceed @last_pfn (enforced with BUG_ON).
 */
/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Hole: skip to the next boundary one level above it. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear consecutive entries until the range or the table page ends. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* Flush the run of entries just cleared. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	/* "start_pfn &&" stops the loop if start_pfn wrapped around to 0. */
	} while (start_pfn && start_pfn <= last_pfn);
}
1101 
/*
 * Recursively free page-table pages below @level that are fully covered
 * by [@start_pfn, @last_pfn].
 *
 * @domain:       domain whose tables are walked
 * @level:        level of the table that @pte points to
 * @retain_level: tables are only freed when their parent entry's level
 *                is below this value
 * @pte:          base of the table at @level
 * @pfn:          first pfn covered by that table
 * @start_pfn:    first pfn of the range being torn down
 * @last_pfn:     last pfn of the range being torn down
 *
 * Non-present entries and superpage entries are skipped.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at the first entry of this table that overlaps the range. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		/* Descend first so children are freed before their parent. */
		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1140 
1141 /*
1142  * clear last level (leaf) ptes and free page table pages below the
1143  * level we wish to keep intact.
1144  */
1145 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1146 				   unsigned long start_pfn,
1147 				   unsigned long last_pfn,
1148 				   int retain_level)
1149 {
1150 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1151 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1152 	BUG_ON(start_pfn > last_pfn);
1153 
1154 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1155 
1156 	/* We don't need lock here; nobody else touches the iova range */
1157 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1158 			   domain->pgd, 0, start_pfn, last_pfn);
1159 
1160 	/* free pgd */
1161 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162 		free_pgtable_page(domain->pgd);
1163 		domain->pgd = NULL;
1164 	}
1165 }
1166 
1167 /* When a page at a given level is being unlinked from its parent, we don't
1168    need to *modify* it at all. All we need to do is make a list of all the
1169    pages which can be freed just as soon as we've flushed the IOTLB and we
1170    know the hardware page-walk will no longer touch them.
1171    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1172    be freed. */
1173 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1174 					    int level, struct dma_pte *pte,
1175 					    struct page *freelist)
1176 {
1177 	struct page *pg;
1178 
1179 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1180 	pg->freelist = freelist;
1181 	freelist = pg;
1182 
1183 	if (level == 1)
1184 		return freelist;
1185 
1186 	pte = page_address(pg);
1187 	do {
1188 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1189 			freelist = dma_pte_list_pagetables(domain, level - 1,
1190 							   pte, freelist);
1191 		pte++;
1192 	} while (!first_pte_in_page(pte));
1193 
1194 	return freelist;
1195 }
1196 
1197 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1198 					struct dma_pte *pte, unsigned long pfn,
1199 					unsigned long start_pfn,
1200 					unsigned long last_pfn,
1201 					struct page *freelist)
1202 {
1203 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1204 
1205 	pfn = max(start_pfn, pfn);
1206 	pte = &pte[pfn_level_offset(pfn, level)];
1207 
1208 	do {
1209 		unsigned long level_pfn;
1210 
1211 		if (!dma_pte_present(pte))
1212 			goto next;
1213 
1214 		level_pfn = pfn & level_mask(level);
1215 
1216 		/* If range covers entire pagetable, free it */
1217 		if (start_pfn <= level_pfn &&
1218 		    last_pfn >= level_pfn + level_size(level) - 1) {
1219 			/* These suborbinate page tables are going away entirely. Don't
1220 			   bother to clear them; we're just going to *free* them. */
1221 			if (level > 1 && !dma_pte_superpage(pte))
1222 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1223 
1224 			dma_clear_pte(pte);
1225 			if (!first_pte)
1226 				first_pte = pte;
1227 			last_pte = pte;
1228 		} else if (level > 1) {
1229 			/* Recurse down into a level that isn't *entirely* obsolete */
1230 			freelist = dma_pte_clear_level(domain, level - 1,
1231 						       phys_to_virt(dma_pte_addr(pte)),
1232 						       level_pfn, start_pfn, last_pfn,
1233 						       freelist);
1234 		}
1235 next:
1236 		pfn += level_size(level);
1237 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1238 
1239 	if (first_pte)
1240 		domain_flush_cache(domain, first_pte,
1241 				   (void *)++last_pte - (void *)first_pte);
1242 
1243 	return freelist;
1244 }
1245 
1246 /* We can't just free the pages because the IOMMU may still be walking
1247    the page tables, and may have cached the intermediate levels. The
1248    pages can only be freed after the IOTLB flush has been done. */
1249 static struct page *domain_unmap(struct dmar_domain *domain,
1250 				 unsigned long start_pfn,
1251 				 unsigned long last_pfn,
1252 				 struct page *freelist)
1253 {
1254 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1255 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1256 	BUG_ON(start_pfn > last_pfn);
1257 
1258 	/* we don't need lock here; nobody else touches the iova range */
1259 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1260 				       domain->pgd, 0, start_pfn, last_pfn,
1261 				       freelist);
1262 
1263 	/* free pgd */
1264 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1265 		struct page *pgd_page = virt_to_page(domain->pgd);
1266 		pgd_page->freelist = freelist;
1267 		freelist = pgd_page;
1268 
1269 		domain->pgd = NULL;
1270 	}
1271 
1272 	return freelist;
1273 }
1274 
1275 static void dma_free_pagelist(struct page *freelist)
1276 {
1277 	struct page *pg;
1278 
1279 	while ((pg = freelist)) {
1280 		freelist = pg->freelist;
1281 		free_pgtable_page(page_address(pg));
1282 	}
1283 }
1284 
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1287 {
1288 	struct root_entry *root;
1289 	unsigned long flags;
1290 
1291 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1292 	if (!root) {
1293 		pr_err("Allocating root entry for %s failed\n",
1294 			iommu->name);
1295 		return -ENOMEM;
1296 	}
1297 
1298 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1299 
1300 	spin_lock_irqsave(&iommu->lock, flags);
1301 	iommu->root_entry = root;
1302 	spin_unlock_irqrestore(&iommu->lock, flags);
1303 
1304 	return 0;
1305 }
1306 
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1308 {
1309 	u64 addr;
1310 	u32 sts;
1311 	unsigned long flag;
1312 
1313 	addr = virt_to_phys(iommu->root_entry);
1314 	if (sm_supported(iommu))
1315 		addr |= DMA_RTADDR_SMT;
1316 
1317 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1319 
1320 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1321 
1322 	/* Make sure hardware complete it */
1323 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324 		      readl, (sts & DMA_GSTS_RTPS), sts);
1325 
1326 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 }
1328 
1329 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1330 {
1331 	u32 val;
1332 	unsigned long flag;
1333 
1334 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1335 		return;
1336 
1337 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1338 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1339 
1340 	/* Make sure hardware complete it */
1341 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1342 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1343 
1344 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1345 }
1346 
1347 /* return value determine if we need a write buffer flush */
1348 static void __iommu_flush_context(struct intel_iommu *iommu,
1349 				  u16 did, u16 source_id, u8 function_mask,
1350 				  u64 type)
1351 {
1352 	u64 val = 0;
1353 	unsigned long flag;
1354 
1355 	switch (type) {
1356 	case DMA_CCMD_GLOBAL_INVL:
1357 		val = DMA_CCMD_GLOBAL_INVL;
1358 		break;
1359 	case DMA_CCMD_DOMAIN_INVL:
1360 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1361 		break;
1362 	case DMA_CCMD_DEVICE_INVL:
1363 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1364 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1365 		break;
1366 	default:
1367 		BUG();
1368 	}
1369 	val |= DMA_CCMD_ICC;
1370 
1371 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1373 
1374 	/* Make sure hardware complete it */
1375 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1376 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1377 
1378 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1379 }
1380 
1381 /* return value determine if we need a write buffer flush */
1382 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1383 				u64 addr, unsigned int size_order, u64 type)
1384 {
1385 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1386 	u64 val = 0, val_iva = 0;
1387 	unsigned long flag;
1388 
1389 	switch (type) {
1390 	case DMA_TLB_GLOBAL_FLUSH:
1391 		/* global flush doesn't need set IVA_REG */
1392 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1393 		break;
1394 	case DMA_TLB_DSI_FLUSH:
1395 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1396 		break;
1397 	case DMA_TLB_PSI_FLUSH:
1398 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399 		/* IH bit is passed in as part of address */
1400 		val_iva = size_order | addr;
1401 		break;
1402 	default:
1403 		BUG();
1404 	}
1405 	/* Note: set drain read/write */
1406 #if 0
1407 	/*
1408 	 * This is probably to be super secure.. Looks like we can
1409 	 * ignore it without any impact.
1410 	 */
1411 	if (cap_read_drain(iommu->cap))
1412 		val |= DMA_TLB_READ_DRAIN;
1413 #endif
1414 	if (cap_write_drain(iommu->cap))
1415 		val |= DMA_TLB_WRITE_DRAIN;
1416 
1417 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1418 	/* Note: Only uses first TLB reg currently */
1419 	if (val_iva)
1420 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1421 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1422 
1423 	/* Make sure hardware complete it */
1424 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1425 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1426 
1427 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1428 
1429 	/* check IOTLB invalidation granularity */
1430 	if (DMA_TLB_IAIG(val) == 0)
1431 		pr_err("Flush IOTLB failed\n");
1432 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1433 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1434 			(unsigned long long)DMA_TLB_IIRG(type),
1435 			(unsigned long long)DMA_TLB_IAIG(val));
1436 }
1437 
1438 static struct device_domain_info *
1439 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1440 			 u8 bus, u8 devfn)
1441 {
1442 	struct device_domain_info *info;
1443 
1444 	assert_spin_locked(&device_domain_lock);
1445 
1446 	if (!iommu->qi)
1447 		return NULL;
1448 
1449 	list_for_each_entry(info, &domain->devices, link)
1450 		if (info->iommu == iommu && info->bus == bus &&
1451 		    info->devfn == devfn) {
1452 			if (info->ats_supported && info->dev)
1453 				return info;
1454 			break;
1455 		}
1456 
1457 	return NULL;
1458 }
1459 
1460 static void domain_update_iotlb(struct dmar_domain *domain)
1461 {
1462 	struct device_domain_info *info;
1463 	bool has_iotlb_device = false;
1464 
1465 	assert_spin_locked(&device_domain_lock);
1466 
1467 	list_for_each_entry(info, &domain->devices, link) {
1468 		struct pci_dev *pdev;
1469 
1470 		if (!info->dev || !dev_is_pci(info->dev))
1471 			continue;
1472 
1473 		pdev = to_pci_dev(info->dev);
1474 		if (pdev->ats_enabled) {
1475 			has_iotlb_device = true;
1476 			break;
1477 		}
1478 	}
1479 
1480 	domain->has_iotlb_device = has_iotlb_device;
1481 }
1482 
1483 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1484 {
1485 	struct pci_dev *pdev;
1486 
1487 	assert_spin_locked(&device_domain_lock);
1488 
1489 	if (!info || !dev_is_pci(info->dev))
1490 		return;
1491 
1492 	pdev = to_pci_dev(info->dev);
1493 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1494 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1495 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1496 	 * reserved, which should be set to 0.
1497 	 */
1498 	if (!ecap_dit(info->iommu->ecap))
1499 		info->pfsid = 0;
1500 	else {
1501 		struct pci_dev *pf_pdev;
1502 
1503 		/* pdev will be returned if device is not a vf */
1504 		pf_pdev = pci_physfn(pdev);
1505 		info->pfsid = pci_dev_id(pf_pdev);
1506 	}
1507 
1508 #ifdef CONFIG_INTEL_IOMMU_SVM
1509 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1510 	   the device if you enable PASID support after ATS support is
1511 	   undefined. So always enable PASID support on devices which
1512 	   have it, even if we can't yet know if we're ever going to
1513 	   use it. */
1514 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1515 		info->pasid_enabled = 1;
1516 
1517 	if (info->pri_supported &&
1518 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1519 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1520 		info->pri_enabled = 1;
1521 #endif
1522 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1523 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1524 		info->ats_enabled = 1;
1525 		domain_update_iotlb(info->domain);
1526 		info->ats_qdep = pci_ats_queue_depth(pdev);
1527 	}
1528 }
1529 
1530 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1531 {
1532 	struct pci_dev *pdev;
1533 
1534 	assert_spin_locked(&device_domain_lock);
1535 
1536 	if (!dev_is_pci(info->dev))
1537 		return;
1538 
1539 	pdev = to_pci_dev(info->dev);
1540 
1541 	if (info->ats_enabled) {
1542 		pci_disable_ats(pdev);
1543 		info->ats_enabled = 0;
1544 		domain_update_iotlb(info->domain);
1545 	}
1546 #ifdef CONFIG_INTEL_IOMMU_SVM
1547 	if (info->pri_enabled) {
1548 		pci_disable_pri(pdev);
1549 		info->pri_enabled = 0;
1550 	}
1551 	if (info->pasid_enabled) {
1552 		pci_disable_pasid(pdev);
1553 		info->pasid_enabled = 0;
1554 	}
1555 #endif
1556 }
1557 
1558 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1559 				  u64 addr, unsigned mask)
1560 {
1561 	u16 sid, qdep;
1562 	unsigned long flags;
1563 	struct device_domain_info *info;
1564 
1565 	if (!domain->has_iotlb_device)
1566 		return;
1567 
1568 	spin_lock_irqsave(&device_domain_lock, flags);
1569 	list_for_each_entry(info, &domain->devices, link) {
1570 		if (!info->ats_enabled)
1571 			continue;
1572 
1573 		sid = info->bus << 8 | info->devfn;
1574 		qdep = info->ats_qdep;
1575 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1576 				qdep, addr, mask);
1577 	}
1578 	spin_unlock_irqrestore(&device_domain_lock, flags);
1579 }
1580 
1581 static void domain_flush_piotlb(struct intel_iommu *iommu,
1582 				struct dmar_domain *domain,
1583 				u64 addr, unsigned long npages, bool ih)
1584 {
1585 	u16 did = domain->iommu_did[iommu->seq_id];
1586 
1587 	if (domain->default_pasid)
1588 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1589 				addr, npages, ih);
1590 
1591 	if (!list_empty(&domain->devices))
1592 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1593 }
1594 
1595 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1596 				  struct dmar_domain *domain,
1597 				  unsigned long pfn, unsigned int pages,
1598 				  int ih, int map)
1599 {
1600 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1601 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1602 	u16 did = domain->iommu_did[iommu->seq_id];
1603 
1604 	BUG_ON(pages == 0);
1605 
1606 	if (ih)
1607 		ih = 1 << 6;
1608 
1609 	if (domain_use_first_level(domain)) {
1610 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1611 	} else {
1612 		/*
1613 		 * Fallback to domain selective flush if no PSI support or
1614 		 * the size is too big. PSI requires page size to be 2 ^ x,
1615 		 * and the base address is naturally aligned to the size.
1616 		 */
1617 		if (!cap_pgsel_inv(iommu->cap) ||
1618 		    mask > cap_max_amask_val(iommu->cap))
1619 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1620 							DMA_TLB_DSI_FLUSH);
1621 		else
1622 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1623 							DMA_TLB_PSI_FLUSH);
1624 	}
1625 
1626 	/*
1627 	 * In caching mode, changes of pages from non-present to present require
1628 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1629 	 */
1630 	if (!cap_caching_mode(iommu->cap) || !map)
1631 		iommu_flush_dev_iotlb(domain, addr, mask);
1632 }
1633 
1634 /* Notification for newly created mappings */
1635 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1636 					struct dmar_domain *domain,
1637 					unsigned long pfn, unsigned int pages)
1638 {
1639 	/*
1640 	 * It's a non-present to present mapping. Only flush if caching mode
1641 	 * and second level.
1642 	 */
1643 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1644 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1645 	else
1646 		iommu_flush_write_buffer(iommu);
1647 }
1648 
1649 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1650 {
1651 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1652 	int idx;
1653 
1654 	for_each_domain_iommu(idx, dmar_domain) {
1655 		struct intel_iommu *iommu = g_iommus[idx];
1656 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1657 
1658 		if (domain_use_first_level(dmar_domain))
1659 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1660 		else
1661 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1662 						 DMA_TLB_DSI_FLUSH);
1663 
1664 		if (!cap_caching_mode(iommu->cap))
1665 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1666 					      0, MAX_AGAW_PFN_WIDTH);
1667 	}
1668 }
1669 
1670 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1671 {
1672 	u32 pmen;
1673 	unsigned long flags;
1674 
1675 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1676 		return;
1677 
1678 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1679 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1680 	pmen &= ~DMA_PMEN_EPM;
1681 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1682 
1683 	/* wait for the protected region status bit to clear */
1684 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1685 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1686 
1687 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1688 }
1689 
1690 static void iommu_enable_translation(struct intel_iommu *iommu)
1691 {
1692 	u32 sts;
1693 	unsigned long flags;
1694 
1695 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1696 	iommu->gcmd |= DMA_GCMD_TE;
1697 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1698 
1699 	/* Make sure hardware complete it */
1700 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1701 		      readl, (sts & DMA_GSTS_TES), sts);
1702 
1703 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1704 }
1705 
1706 static void iommu_disable_translation(struct intel_iommu *iommu)
1707 {
1708 	u32 sts;
1709 	unsigned long flag;
1710 
1711 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1712 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1713 		return;
1714 
1715 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1716 	iommu->gcmd &= ~DMA_GCMD_TE;
1717 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1718 
1719 	/* Make sure hardware complete it */
1720 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1721 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1722 
1723 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1724 }
1725 
1726 static int iommu_init_domains(struct intel_iommu *iommu)
1727 {
1728 	u32 ndomains, nlongs;
1729 	size_t size;
1730 
1731 	ndomains = cap_ndoms(iommu->cap);
1732 	pr_debug("%s: Number of Domains supported <%d>\n",
1733 		 iommu->name, ndomains);
1734 	nlongs = BITS_TO_LONGS(ndomains);
1735 
1736 	spin_lock_init(&iommu->lock);
1737 
1738 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1739 	if (!iommu->domain_ids) {
1740 		pr_err("%s: Allocating domain id array failed\n",
1741 		       iommu->name);
1742 		return -ENOMEM;
1743 	}
1744 
1745 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1746 	iommu->domains = kzalloc(size, GFP_KERNEL);
1747 
1748 	if (iommu->domains) {
1749 		size = 256 * sizeof(struct dmar_domain *);
1750 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1751 	}
1752 
1753 	if (!iommu->domains || !iommu->domains[0]) {
1754 		pr_err("%s: Allocating domain array failed\n",
1755 		       iommu->name);
1756 		kfree(iommu->domain_ids);
1757 		kfree(iommu->domains);
1758 		iommu->domain_ids = NULL;
1759 		iommu->domains    = NULL;
1760 		return -ENOMEM;
1761 	}
1762 
1763 	/*
1764 	 * If Caching mode is set, then invalid translations are tagged
1765 	 * with domain-id 0, hence we need to pre-allocate it. We also
1766 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1767 	 * make sure it is not used for a real domain.
1768 	 */
1769 	set_bit(0, iommu->domain_ids);
1770 
1771 	/*
1772 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1773 	 * entry for first-level or pass-through translation modes should
1774 	 * be programmed with a domain id different from those used for
1775 	 * second-level or nested translation. We reserve a domain id for
1776 	 * this purpose.
1777 	 */
1778 	if (sm_supported(iommu))
1779 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1780 
1781 	return 0;
1782 }
1783 
1784 static void disable_dmar_iommu(struct intel_iommu *iommu)
1785 {
1786 	struct device_domain_info *info, *tmp;
1787 	unsigned long flags;
1788 
1789 	if (!iommu->domains || !iommu->domain_ids)
1790 		return;
1791 
1792 	spin_lock_irqsave(&device_domain_lock, flags);
1793 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1794 		if (info->iommu != iommu)
1795 			continue;
1796 
1797 		if (!info->dev || !info->domain)
1798 			continue;
1799 
1800 		__dmar_remove_one_dev_info(info);
1801 	}
1802 	spin_unlock_irqrestore(&device_domain_lock, flags);
1803 
1804 	if (iommu->gcmd & DMA_GCMD_TE)
1805 		iommu_disable_translation(iommu);
1806 }
1807 
1808 static void free_dmar_iommu(struct intel_iommu *iommu)
1809 {
1810 	if ((iommu->domains) && (iommu->domain_ids)) {
1811 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1812 		int i;
1813 
1814 		for (i = 0; i < elems; i++)
1815 			kfree(iommu->domains[i]);
1816 		kfree(iommu->domains);
1817 		kfree(iommu->domain_ids);
1818 		iommu->domains = NULL;
1819 		iommu->domain_ids = NULL;
1820 	}
1821 
1822 	g_iommus[iommu->seq_id] = NULL;
1823 
1824 	/* free context mapping */
1825 	free_context_table(iommu);
1826 
1827 #ifdef CONFIG_INTEL_IOMMU_SVM
1828 	if (pasid_supported(iommu)) {
1829 		if (ecap_prs(iommu->ecap))
1830 			intel_svm_finish_prq(iommu);
1831 	}
1832 	if (vccap_pasid(iommu->vccap))
1833 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1834 
1835 #endif
1836 }
1837 
1838 /*
1839  * Check and return whether first level is used by default for
1840  * DMA translation.
1841  */
1842 static bool first_level_by_default(void)
1843 {
1844 	struct dmar_drhd_unit *drhd;
1845 	struct intel_iommu *iommu;
1846 	static int first_level_support = -1;
1847 
1848 	if (likely(first_level_support != -1))
1849 		return first_level_support;
1850 
1851 	first_level_support = 1;
1852 
1853 	rcu_read_lock();
1854 	for_each_active_iommu(iommu, drhd) {
1855 		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1856 			first_level_support = 0;
1857 			break;
1858 		}
1859 	}
1860 	rcu_read_unlock();
1861 
1862 	return first_level_support;
1863 }
1864 
1865 static struct dmar_domain *alloc_domain(int flags)
1866 {
1867 	struct dmar_domain *domain;
1868 
1869 	domain = alloc_domain_mem();
1870 	if (!domain)
1871 		return NULL;
1872 
1873 	memset(domain, 0, sizeof(*domain));
1874 	domain->nid = NUMA_NO_NODE;
1875 	domain->flags = flags;
1876 	if (first_level_by_default())
1877 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1878 	domain->has_iotlb_device = false;
1879 	INIT_LIST_HEAD(&domain->devices);
1880 
1881 	return domain;
1882 }
1883 
1884 /* Must be called with iommu->lock */
1885 static int domain_attach_iommu(struct dmar_domain *domain,
1886 			       struct intel_iommu *iommu)
1887 {
1888 	unsigned long ndomains;
1889 	int num;
1890 
1891 	assert_spin_locked(&device_domain_lock);
1892 	assert_spin_locked(&iommu->lock);
1893 
1894 	domain->iommu_refcnt[iommu->seq_id] += 1;
1895 	domain->iommu_count += 1;
1896 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1897 		ndomains = cap_ndoms(iommu->cap);
1898 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1899 
1900 		if (num >= ndomains) {
1901 			pr_err("%s: No free domain ids\n", iommu->name);
1902 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1903 			domain->iommu_count -= 1;
1904 			return -ENOSPC;
1905 		}
1906 
1907 		set_bit(num, iommu->domain_ids);
1908 		set_iommu_domain(iommu, num, domain);
1909 
1910 		domain->iommu_did[iommu->seq_id] = num;
1911 		domain->nid			 = iommu->node;
1912 
1913 		domain_update_iommu_cap(domain);
1914 	}
1915 
1916 	return 0;
1917 }
1918 
1919 static int domain_detach_iommu(struct dmar_domain *domain,
1920 			       struct intel_iommu *iommu)
1921 {
1922 	int num, count;
1923 
1924 	assert_spin_locked(&device_domain_lock);
1925 	assert_spin_locked(&iommu->lock);
1926 
1927 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1928 	count = --domain->iommu_count;
1929 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1930 		num = domain->iommu_did[iommu->seq_id];
1931 		clear_bit(num, iommu->domain_ids);
1932 		set_iommu_domain(iommu, num, NULL);
1933 
1934 		domain_update_iommu_cap(domain);
1935 		domain->iommu_did[iommu->seq_id] = 0;
1936 	}
1937 
1938 	return count;
1939 }
1940 
/*
 * Round a guest address width up to the next value of the form 12 + 9 * n
 * and cap the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1954 
1955 static void domain_exit(struct dmar_domain *domain)
1956 {
1957 
1958 	/* Remove associated devices and clear attached or cached domains */
1959 	domain_remove_dev_info(domain);
1960 
1961 	/* destroy iovas */
1962 	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1963 		iommu_put_dma_cookie(&domain->domain);
1964 
1965 	if (domain->pgd) {
1966 		struct page *freelist;
1967 
1968 		freelist = domain_unmap(domain, 0,
1969 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1970 		dma_free_pagelist(freelist);
1971 	}
1972 
1973 	free_domain_mem(domain);
1974 }
1975 
1976 /*
1977  * Get the PASID directory size for scalable mode context entry.
1978  * Value of X in the PDTS field of a scalable mode context entry
1979  * indicates PASID directory with 2^(X + 7) entries.
1980  */
1981 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1982 {
1983 	int pds, max_pde;
1984 
1985 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1986 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1987 	if (pds < 7)
1988 		return 0;
1989 
1990 	return pds - 7;
1991 }
1992 
1993 /*
1994  * Set the RID_PASID field of a scalable mode context entry. The
1995  * IOMMU hardware will use the PASID value set in this field for
1996  * DMA translations of DMA requests without PASID.
1997  */
1998 static inline void
1999 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2000 {
2001 	context->hi |= pasid & ((1 << 20) - 1);
2002 }
2003 
2004 /*
2005  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2006  * entry.
2007  */
2008 static inline void context_set_sm_dte(struct context_entry *context)
2009 {
2010 	context->lo |= (1 << 2);
2011 }
2012 
2013 /*
2014  * Set the PRE(Page Request Enable) field of a scalable mode context
2015  * entry.
2016  */
2017 static inline void context_set_sm_pre(struct context_entry *context)
2018 {
2019 	context->lo |= (1 << 4);
2020 }
2021 
/*
 * Convert value to context PASID directory size field coding: the low three
 * bits of @pds are placed at bit position 9.
 */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
2024 
2025 static int domain_context_mapping_one(struct dmar_domain *domain,
2026 				      struct intel_iommu *iommu,
2027 				      struct pasid_table *table,
2028 				      u8 bus, u8 devfn)
2029 {
2030 	u16 did = domain->iommu_did[iommu->seq_id];
2031 	int translation = CONTEXT_TT_MULTI_LEVEL;
2032 	struct device_domain_info *info = NULL;
2033 	struct context_entry *context;
2034 	unsigned long flags;
2035 	int ret;
2036 
2037 	WARN_ON(did == 0);
2038 
2039 	if (hw_pass_through && domain_type_is_si(domain))
2040 		translation = CONTEXT_TT_PASS_THROUGH;
2041 
2042 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2043 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2044 
2045 	BUG_ON(!domain->pgd);
2046 
2047 	spin_lock_irqsave(&device_domain_lock, flags);
2048 	spin_lock(&iommu->lock);
2049 
2050 	ret = -ENOMEM;
2051 	context = iommu_context_addr(iommu, bus, devfn, 1);
2052 	if (!context)
2053 		goto out_unlock;
2054 
2055 	ret = 0;
2056 	if (context_present(context))
2057 		goto out_unlock;
2058 
2059 	/*
2060 	 * For kdump cases, old valid entries may be cached due to the
2061 	 * in-flight DMA and copied pgtable, but there is no unmapping
2062 	 * behaviour for them, thus we need an explicit cache flush for
2063 	 * the newly-mapped device. For kdump, at this point, the device
2064 	 * is supposed to finish reset at its driver probe stage, so no
2065 	 * in-flight DMA will exist, and we don't need to worry anymore
2066 	 * hereafter.
2067 	 */
2068 	if (context_copied(context)) {
2069 		u16 did_old = context_domain_id(context);
2070 
2071 		if (did_old < cap_ndoms(iommu->cap)) {
2072 			iommu->flush.flush_context(iommu, did_old,
2073 						   (((u16)bus) << 8) | devfn,
2074 						   DMA_CCMD_MASK_NOBIT,
2075 						   DMA_CCMD_DEVICE_INVL);
2076 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2077 						 DMA_TLB_DSI_FLUSH);
2078 		}
2079 	}
2080 
2081 	context_clear_entry(context);
2082 
2083 	if (sm_supported(iommu)) {
2084 		unsigned long pds;
2085 
2086 		WARN_ON(!table);
2087 
2088 		/* Setup the PASID DIR pointer: */
2089 		pds = context_get_sm_pds(table);
2090 		context->lo = (u64)virt_to_phys(table->table) |
2091 				context_pdts(pds);
2092 
2093 		/* Setup the RID_PASID field: */
2094 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2095 
2096 		/*
2097 		 * Setup the Device-TLB enable bit and Page request
2098 		 * Enable bit:
2099 		 */
2100 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2101 		if (info && info->ats_supported)
2102 			context_set_sm_dte(context);
2103 		if (info && info->pri_supported)
2104 			context_set_sm_pre(context);
2105 	} else {
2106 		struct dma_pte *pgd = domain->pgd;
2107 		int agaw;
2108 
2109 		context_set_domain_id(context, did);
2110 
2111 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2112 			/*
2113 			 * Skip top levels of page tables for iommu which has
2114 			 * less agaw than default. Unnecessary for PT mode.
2115 			 */
2116 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2117 				ret = -ENOMEM;
2118 				pgd = phys_to_virt(dma_pte_addr(pgd));
2119 				if (!dma_pte_present(pgd))
2120 					goto out_unlock;
2121 			}
2122 
2123 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2124 			if (info && info->ats_supported)
2125 				translation = CONTEXT_TT_DEV_IOTLB;
2126 			else
2127 				translation = CONTEXT_TT_MULTI_LEVEL;
2128 
2129 			context_set_address_root(context, virt_to_phys(pgd));
2130 			context_set_address_width(context, agaw);
2131 		} else {
2132 			/*
2133 			 * In pass through mode, AW must be programmed to
2134 			 * indicate the largest AGAW value supported by
2135 			 * hardware. And ASR is ignored by hardware.
2136 			 */
2137 			context_set_address_width(context, iommu->msagaw);
2138 		}
2139 
2140 		context_set_translation_type(context, translation);
2141 	}
2142 
2143 	context_set_fault_enable(context);
2144 	context_set_present(context);
2145 	if (!ecap_coherent(iommu->ecap))
2146 		clflush_cache_range(context, sizeof(*context));
2147 
2148 	/*
2149 	 * It's a non-present to present mapping. If hardware doesn't cache
2150 	 * non-present entry we only need to flush the write-buffer. If the
2151 	 * _does_ cache non-present entries, then it does so in the special
2152 	 * domain #0, which we have to flush:
2153 	 */
2154 	if (cap_caching_mode(iommu->cap)) {
2155 		iommu->flush.flush_context(iommu, 0,
2156 					   (((u16)bus) << 8) | devfn,
2157 					   DMA_CCMD_MASK_NOBIT,
2158 					   DMA_CCMD_DEVICE_INVL);
2159 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2160 	} else {
2161 		iommu_flush_write_buffer(iommu);
2162 	}
2163 	iommu_enable_dev_iotlb(info);
2164 
2165 	ret = 0;
2166 
2167 out_unlock:
2168 	spin_unlock(&iommu->lock);
2169 	spin_unlock_irqrestore(&device_domain_lock, flags);
2170 
2171 	return ret;
2172 }
2173 
/*
 * Argument bundle that domain_context_mapping() passes through
 * pci_for_each_dma_alias() to domain_context_mapping_cb().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain whose context is being set up */
	struct intel_iommu *iommu;	/* IOMMU returned by device_to_iommu() */
	struct pasid_table *table;	/* result of intel_pasid_get_table() */
};
2179 
2180 static int domain_context_mapping_cb(struct pci_dev *pdev,
2181 				     u16 alias, void *opaque)
2182 {
2183 	struct domain_context_mapping_data *data = opaque;
2184 
2185 	return domain_context_mapping_one(data->domain, data->iommu,
2186 					  data->table, PCI_BUS_NUM(alias),
2187 					  alias & 0xff);
2188 }
2189 
2190 static int
2191 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2192 {
2193 	struct domain_context_mapping_data data;
2194 	struct pasid_table *table;
2195 	struct intel_iommu *iommu;
2196 	u8 bus, devfn;
2197 
2198 	iommu = device_to_iommu(dev, &bus, &devfn);
2199 	if (!iommu)
2200 		return -ENODEV;
2201 
2202 	table = intel_pasid_get_table(dev);
2203 
2204 	if (!dev_is_pci(dev))
2205 		return domain_context_mapping_one(domain, iommu, table,
2206 						  bus, devfn);
2207 
2208 	data.domain = domain;
2209 	data.iommu = iommu;
2210 	data.table = table;
2211 
2212 	return pci_for_each_dma_alias(to_pci_dev(dev),
2213 				      &domain_context_mapping_cb, &data);
2214 }
2215 
2216 static int domain_context_mapped_cb(struct pci_dev *pdev,
2217 				    u16 alias, void *opaque)
2218 {
2219 	struct intel_iommu *iommu = opaque;
2220 
2221 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2222 }
2223 
2224 static int domain_context_mapped(struct device *dev)
2225 {
2226 	struct intel_iommu *iommu;
2227 	u8 bus, devfn;
2228 
2229 	iommu = device_to_iommu(dev, &bus, &devfn);
2230 	if (!iommu)
2231 		return -ENODEV;
2232 
2233 	if (!dev_is_pci(dev))
2234 		return device_context_mapped(iommu, bus, devfn);
2235 
2236 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2237 				       domain_context_mapped_cb, iommu);
2238 }
2239 
2240 /* Returns a number of VTD pages, but aligned to MM page size */
2241 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2242 					    size_t size)
2243 {
2244 	host_addr &= ~PAGE_MASK;
2245 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2246 }
2247 
2248 /* Return largest possible superpage level for a given mapping */
2249 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2250 					  unsigned long iov_pfn,
2251 					  unsigned long phy_pfn,
2252 					  unsigned long pages)
2253 {
2254 	int support, level = 1;
2255 	unsigned long pfnmerge;
2256 
2257 	support = domain->iommu_superpage;
2258 
2259 	/* To use a large page, the virtual *and* physical addresses
2260 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2261 	   of them will mean we have to use smaller pages. So just
2262 	   merge them and check both at once. */
2263 	pfnmerge = iov_pfn | phy_pfn;
2264 
2265 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2266 		pages >>= VTD_STRIDE_SHIFT;
2267 		if (!pages)
2268 			break;
2269 		pfnmerge >>= VTD_STRIDE_SHIFT;
2270 		level++;
2271 		support--;
2272 	}
2273 	return level;
2274 }
2275 
/*
 * Populate @domain's page table so that @nr_pages VT-d pages starting at
 * IOVA page frame @iov_pfn translate to physical page frame @phys_pfn with
 * the access bits given in @prot.
 *
 * Superpage PTEs are used whenever hardware_largepage_caps() reports a level
 * above 1 for the remaining range. Written PTEs are handed to
 * domain_flush_cache() in runs that never cross a page-table page.
 *
 * Returns 0 on success, -EINVAL if @prot has neither DMA_PTE_READ nor
 * DMA_PTE_WRITE set, or -ENOMEM if pfn_to_dma_pte() yields no PTE.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	/* Only the read/write/snoop bits of @prot are carried into the PTE */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	if (domain_use_first_level(domain))
		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/*
		 * pte is NULL on the first pass and after every flush below:
		 * (re)compute the page size to use and look up the PTE slot.
		 */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = nr_pages / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		/*
		 * Write the PTE only if the slot currently holds 0. A non-zero
		 * old value means the slot was already populated; it is left
		 * as it was and the condition is reported below.
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Dump the DMA mappings for the first five hits only */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Number of VT-d pages covered by the PTE just written */
		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);

		/* Advance all cursors past the range covered by this PTE */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			/* Flush the byte range [first_pte, pte) written so far */
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2377 
2378 static int
2379 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380 	       unsigned long phys_pfn, unsigned long nr_pages, int prot)
2381 {
2382 	int iommu_id, ret;
2383 	struct intel_iommu *iommu;
2384 
2385 	/* Do the real mapping first */
2386 	ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2387 	if (ret)
2388 		return ret;
2389 
2390 	for_each_domain_iommu(iommu_id, domain) {
2391 		iommu = g_iommus[iommu_id];
2392 		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2393 	}
2394 
2395 	return 0;
2396 }
2397 
2398 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2399 {
2400 	unsigned long flags;
2401 	struct context_entry *context;
2402 	u16 did_old;
2403 
2404 	if (!iommu)
2405 		return;
2406 
2407 	spin_lock_irqsave(&iommu->lock, flags);
2408 	context = iommu_context_addr(iommu, bus, devfn, 0);
2409 	if (!context) {
2410 		spin_unlock_irqrestore(&iommu->lock, flags);
2411 		return;
2412 	}
2413 	did_old = context_domain_id(context);
2414 	context_clear_entry(context);
2415 	__iommu_flush_cache(iommu, context, sizeof(*context));
2416 	spin_unlock_irqrestore(&iommu->lock, flags);
2417 	iommu->flush.flush_context(iommu,
2418 				   did_old,
2419 				   (((u16)bus) << 8) | devfn,
2420 				   DMA_CCMD_MASK_NOBIT,
2421 				   DMA_CCMD_DEVICE_INVL);
2422 	iommu->flush.flush_iotlb(iommu,
2423 				 did_old,
2424 				 0,
2425 				 0,
2426 				 DMA_TLB_DSI_FLUSH);
2427 }
2428 
2429 static inline void unlink_domain_info(struct device_domain_info *info)
2430 {
2431 	assert_spin_locked(&device_domain_lock);
2432 	list_del(&info->link);
2433 	list_del(&info->global);
2434 	if (info->dev)
2435 		dev_iommu_priv_set(info->dev, NULL);
2436 }
2437 
2438 static void domain_remove_dev_info(struct dmar_domain *domain)
2439 {
2440 	struct device_domain_info *info, *tmp;
2441 	unsigned long flags;
2442 
2443 	spin_lock_irqsave(&device_domain_lock, flags);
2444 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2445 		__dmar_remove_one_dev_info(info);
2446 	spin_unlock_irqrestore(&device_domain_lock, flags);
2447 }
2448 
2449 struct dmar_domain *find_domain(struct device *dev)
2450 {
2451 	struct device_domain_info *info;
2452 
2453 	if (unlikely(!dev || !dev->iommu))
2454 		return NULL;
2455 
2456 	if (unlikely(attach_deferred(dev)))
2457 		return NULL;
2458 
2459 	/* No lock here, assumes no domain exit in normal case */
2460 	info = get_domain_info(dev);
2461 	if (likely(info))
2462 		return info->domain;
2463 
2464 	return NULL;
2465 }
2466 
2467 static inline struct device_domain_info *
2468 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2469 {
2470 	struct device_domain_info *info;
2471 
2472 	list_for_each_entry(info, &device_domain_list, global)
2473 		if (info->segment == segment && info->bus == bus &&
2474 		    info->devfn == devfn)
2475 			return info;
2476 
2477 	return NULL;
2478 }
2479 
2480 static int domain_setup_first_level(struct intel_iommu *iommu,
2481 				    struct dmar_domain *domain,
2482 				    struct device *dev,
2483 				    u32 pasid)
2484 {
2485 	int flags = PASID_FLAG_SUPERVISOR_MODE;
2486 	struct dma_pte *pgd = domain->pgd;
2487 	int agaw, level;
2488 
2489 	/*
2490 	 * Skip top levels of page tables for iommu which has
2491 	 * less agaw than default. Unnecessary for PT mode.
2492 	 */
2493 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2494 		pgd = phys_to_virt(dma_pte_addr(pgd));
2495 		if (!dma_pte_present(pgd))
2496 			return -ENOMEM;
2497 	}
2498 
2499 	level = agaw_to_level(agaw);
2500 	if (level != 4 && level != 5)
2501 		return -EINVAL;
2502 
2503 	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2504 
2505 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2506 					     domain->iommu_did[iommu->seq_id],
2507 					     flags);
2508 }
2509 
2510 static bool dev_is_real_dma_subdevice(struct device *dev)
2511 {
2512 	return dev && dev_is_pci(dev) &&
2513 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2514 }
2515 
/*
 * Allocate a device_domain_info for the device at (@bus, @devfn) behind
 * @iommu, bind it to @domain and program the translation structures.
 *
 * @dev may be NULL; in that case no per-device setup (PASID table,
 * context mapping) is performed.
 *
 * Returns:
 *  - @domain on success;
 *  - a different domain if the device (or its segment/bus/devfn) is already
 *    bound to one, in which case the caller still owns @domain;
 *  - NULL on allocation or setup failure.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	/*
	 * For a "real DMA sub-device" record the PCI device's own
	 * segment/bus/devfn rather than the values passed in by the caller.
	 */
	if (!dev_is_real_dma_subdevice(dev)) {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	} else {
		struct pci_dev *pdev = to_pci_dev(dev);

		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	}

	/* Start with every optional capability off; probed below for PCI */
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;
	info->auxd_enabled = 0;
	INIT_LIST_HEAD(&info->auxiliary_domains);

	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		/* ATS needs IOMMU support, device support and a matching ATSR */
		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
				/* Bit 0 is forced on so the field is non-zero */
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* PRI is only considered when ATS is supported */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	/* Fall back to a lookup by segment/bus/devfn in the global list */
	if (!found) {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
						       info->devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	/* iommu->lock nests inside device_domain_lock here */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	/* Publish the new info on the domain's list and the global list */
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev_iommu_priv_set(dev, info);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/*
	 * From here on @info is published, so every failure path below
	 * undoes the work via dmar_remove_one_dev_info() instead of freeing
	 * @info directly.
	 */

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock_irqsave(&iommu->lock, flags);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else if (domain_use_first_level(domain))
			ret = domain_setup_first_level(iommu, domain, dev,
					PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock_irqrestore(&iommu->lock, flags);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}
	}

	if (dev && domain_context_mapping(domain, dev)) {
		dev_err(dev, "Domain context map failed\n");
		dmar_remove_one_dev_info(dev);
		return NULL;
	}

	return domain;
}
2646 
2647 static int iommu_domain_identity_map(struct dmar_domain *domain,
2648 				     unsigned long first_vpfn,
2649 				     unsigned long last_vpfn)
2650 {
2651 	/*
2652 	 * RMRR range might have overlap with physical memory range,
2653 	 * clear it first
2654 	 */
2655 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2656 
2657 	return __domain_mapping(domain, first_vpfn,
2658 				first_vpfn, last_vpfn - first_vpfn + 1,
2659 				DMA_PTE_READ|DMA_PTE_WRITE);
2660 }
2661 
/* Forward declaration: si_domain_init() below calls this before its definition. */
static int md_domain_init(struct dmar_domain *domain, int guest_width);
2663 
2664 static int __init si_domain_init(int hw)
2665 {
2666 	struct dmar_rmrr_unit *rmrr;
2667 	struct device *dev;
2668 	int i, nid, ret;
2669 
2670 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2671 	if (!si_domain)
2672 		return -EFAULT;
2673 
2674 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2675 		domain_exit(si_domain);
2676 		return -EFAULT;
2677 	}
2678 
2679 	if (hw)
2680 		return 0;
2681 
2682 	for_each_online_node(nid) {
2683 		unsigned long start_pfn, end_pfn;
2684 		int i;
2685 
2686 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2687 			ret = iommu_domain_identity_map(si_domain,
2688 					mm_to_dma_pfn(start_pfn),
2689 					mm_to_dma_pfn(end_pfn));
2690 			if (ret)
2691 				return ret;
2692 		}
2693 	}
2694 
2695 	/*
2696 	 * Identity map the RMRRs so that devices with RMRRs could also use
2697 	 * the si_domain.
2698 	 */
2699 	for_each_rmrr_units(rmrr) {
2700 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2701 					  i, dev) {
2702 			unsigned long long start = rmrr->base_address;
2703 			unsigned long long end = rmrr->end_address;
2704 
2705 			if (WARN_ON(end < start ||
2706 				    end >> agaw_to_width(si_domain->agaw)))
2707 				continue;
2708 
2709 			ret = iommu_domain_identity_map(si_domain,
2710 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2711 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2712 			if (ret)
2713 				return ret;
2714 		}
2715 	}
2716 
2717 	return 0;
2718 }
2719 
2720 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2721 {
2722 	struct dmar_domain *ndomain;
2723 	struct intel_iommu *iommu;
2724 	u8 bus, devfn;
2725 
2726 	iommu = device_to_iommu(dev, &bus, &devfn);
2727 	if (!iommu)
2728 		return -ENODEV;
2729 
2730 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2731 	if (ndomain != domain)
2732 		return -EBUSY;
2733 
2734 	return 0;
2735 }
2736 
2737 static bool device_has_rmrr(struct device *dev)
2738 {
2739 	struct dmar_rmrr_unit *rmrr;
2740 	struct device *tmp;
2741 	int i;
2742 
2743 	rcu_read_lock();
2744 	for_each_rmrr_units(rmrr) {
2745 		/*
2746 		 * Return TRUE if this RMRR contains the device that
2747 		 * is passed in.
2748 		 */
2749 		for_each_active_dev_scope(rmrr->devices,
2750 					  rmrr->devices_cnt, i, tmp)
2751 			if (tmp == dev ||
2752 			    is_downstream_to_pci_bridge(dev, tmp)) {
2753 				rcu_read_unlock();
2754 				return true;
2755 			}
2756 	}
2757 	rcu_read_unlock();
2758 	return false;
2759 }
2760 
2761 /**
2762  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2763  * is relaxable (ie. is allowed to be not enforced under some conditions)
2764  * @dev: device handle
2765  *
2766  * We assume that PCI USB devices with RMRRs have them largely
2767  * for historical reasons and that the RMRR space is not actively used post
2768  * boot.  This exclusion may change if vendors begin to abuse it.
2769  *
2770  * The same exception is made for graphics devices, with the requirement that
2771  * any use of the RMRR regions will be torn down before assigning the device
2772  * to a guest.
2773  *
2774  * Return: true if the RMRR is relaxable, false otherwise
2775  */
2776 static bool device_rmrr_is_relaxable(struct device *dev)
2777 {
2778 	struct pci_dev *pdev;
2779 
2780 	if (!dev_is_pci(dev))
2781 		return false;
2782 
2783 	pdev = to_pci_dev(dev);
2784 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2785 		return true;
2786 	else
2787 		return false;
2788 }
2789 
2790 /*
2791  * There are a couple cases where we need to restrict the functionality of
2792  * devices associated with RMRRs.  The first is when evaluating a device for
2793  * identity mapping because problems exist when devices are moved in and out
2794  * of domains and their respective RMRR information is lost.  This means that
2795  * a device with associated RMRRs will never be in a "passthrough" domain.
2796  * The second is use of the device through the IOMMU API.  This interface
2797  * expects to have full control of the IOVA space for the device.  We cannot
2798  * satisfy both the requirement that RMRR access is maintained and have an
2799  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2800  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2801  * We therefore prevent devices associated with an RMRR from participating in
2802  * the IOMMU API, which eliminates them from device assignment.
2803  *
2804  * In both cases, devices which have relaxable RMRRs are not concerned by this
2805  * restriction. See device_rmrr_is_relaxable comment.
2806  */
2807 static bool device_is_rmrr_locked(struct device *dev)
2808 {
2809 	if (!device_has_rmrr(dev))
2810 		return false;
2811 
2812 	if (device_rmrr_is_relaxable(dev))
2813 		return false;
2814 
2815 	return true;
2816 }
2817 
2818 /*
2819  * Return the required default domain type for a specific device.
2820  *
2821  * @dev: the device in query
2822  * @startup: true if this is during early boot
2823  *
2824  * Returns:
2825  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2826  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2827  *  - 0: both identity and dynamic domains work for this device
2828  */
2829 static int device_def_domain_type(struct device *dev)
2830 {
2831 	if (dev_is_pci(dev)) {
2832 		struct pci_dev *pdev = to_pci_dev(dev);
2833 
2834 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2835 			return IOMMU_DOMAIN_IDENTITY;
2836 
2837 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2838 			return IOMMU_DOMAIN_IDENTITY;
2839 	}
2840 
2841 	return 0;
2842 }
2843 
2844 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2845 {
2846 	/*
2847 	 * Start from the sane iommu hardware state.
2848 	 * If the queued invalidation is already initialized by us
2849 	 * (for example, while enabling interrupt-remapping) then
2850 	 * we got the things already rolling from a sane state.
2851 	 */
2852 	if (!iommu->qi) {
2853 		/*
2854 		 * Clear any previous faults.
2855 		 */
2856 		dmar_fault(-1, iommu);
2857 		/*
2858 		 * Disable queued invalidation if supported and already enabled
2859 		 * before OS handover.
2860 		 */
2861 		dmar_disable_qi(iommu);
2862 	}
2863 
2864 	if (dmar_enable_qi(iommu)) {
2865 		/*
2866 		 * Queued Invalidate not enabled, use Register Based Invalidate
2867 		 */
2868 		iommu->flush.flush_context = __iommu_flush_context;
2869 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2870 		pr_info("%s: Using Register based invalidation\n",
2871 			iommu->name);
2872 	} else {
2873 		iommu->flush.flush_context = qi_flush_context;
2874 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2875 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2876 	}
2877 }
2878 
/*
 * Copy the context table(s) that the old root entry @old_re references for
 * @bus into newly allocated pages and record the new pages in @tbl.
 *
 * With @ext set, each bus occupies two slots in @tbl (index bus * 2 and
 * bus * 2 + 1): the first is filled from the root entry's lower
 * context-table pointer (devfn 0x00-0x7f), the second from the upper one
 * (devfn 0x80-0xff). Without @ext a single table at index @bus is used.
 *
 * Every present entry that is copied has PASID enable cleared and the
 * "copied" marker set, and its domain id is reserved in iommu->domain_ids.
 *
 * Returns 0 on success (including when the old root entry has no context
 * table) or -ENOMEM if remapping the old table or allocating the new page
 * fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a private copy of the old root entry */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx wraps to 0 at devfn 0 and, in extended mode, again at
		 * devfn 0x80: that is where a new source table begins.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The next table goes into the second slot */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/* the loop increment moves devfn on to 0x80 */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		/* Reserve the old kernel's domain id so it is not handed out */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	/* Store the last table built: slot 0, or slot 1 after a wrap */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2980 
/*
 * Copy the context tables referenced by the root table currently programmed
 * in @iommu's DMAR_RTADDR_REG into newly allocated tables, then hook the
 * copies into iommu->root_entry.
 *
 * Returns 0 on success, -EINVAL if the programmed root-table format (RTT
 * bit) differs from what ecap_ecs() reports or if no root table address is
 * programmed, and -ENOMEM if the old root table cannot be remapped or the
 * temporary pointer array cannot be allocated. A failure to copy the
 * context table of an individual bus is logged and otherwise ignored.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	unsigned long flags;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	/* ext: format currently programmed; new_ext: format per ecap_ecs() */
	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
	new_ext    = !!ecap_ecs(iommu->ecap);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	/* Extended mode needs two context-table slots per bus */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			/* Best effort: log and carry on with the next bus */
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock_irqsave(&iommu->lock, flags);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		/* Bit 0 is set together with the table's physical address */
		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		/* The upper half is only populated in extended mode */
		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock_irqrestore(&iommu->lock, flags);

	/* Only the pointer array is freed; the copied tables stay in use */
	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
3062 
3063 #ifdef CONFIG_INTEL_IOMMU_SVM
3064 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3065 {
3066 	struct intel_iommu *iommu = data;
3067 	ioasid_t ioasid;
3068 
3069 	if (!iommu)
3070 		return INVALID_IOASID;
3071 	/*
3072 	 * VT-d virtual command interface always uses the full 20 bit
3073 	 * PASID range. Host can partition guest PASID range based on
3074 	 * policies but it is out of guest's control.
3075 	 */
3076 	if (min < PASID_MIN || max > intel_pasid_max_id)
3077 		return INVALID_IOASID;
3078 
3079 	if (vcmd_alloc_pasid(iommu, &ioasid))
3080 		return INVALID_IOASID;
3081 
3082 	return ioasid;
3083 }
3084 
3085 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3086 {
3087 	struct intel_iommu *iommu = data;
3088 
3089 	if (!iommu)
3090 		return;
3091 	/*
3092 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3093 	 * We can only free the PASID when all the devices are unbound.
3094 	 */
3095 	if (ioasid_find(NULL, ioasid, NULL)) {
3096 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3097 		return;
3098 	}
3099 	vcmd_free_pasid(iommu, ioasid);
3100 }
3101 
3102 static void register_pasid_allocator(struct intel_iommu *iommu)
3103 {
3104 	/*
3105 	 * If we are running in the host, no need for custom allocator
3106 	 * in that PASIDs are allocated from the host system-wide.
3107 	 */
3108 	if (!cap_caching_mode(iommu->cap))
3109 		return;
3110 
3111 	if (!sm_supported(iommu)) {
3112 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3113 		return;
3114 	}
3115 
3116 	/*
3117 	 * Register a custom PASID allocator if we are running in a guest,
3118 	 * guest PASID must be obtained via virtual command interface.
3119 	 * There can be multiple vIOMMUs in each guest but only one allocator
3120 	 * is active. All vIOMMU allocators will eventually be calling the same
3121 	 * host allocator.
3122 	 */
3123 	if (!vccap_pasid(iommu->vccap))
3124 		return;
3125 
3126 	pr_info("Register custom PASID allocator\n");
3127 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3128 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3129 	iommu->pasid_allocator.pdata = (void *)iommu;
3130 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3131 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3132 		/*
3133 		 * Disable scalable mode on this IOMMU if there
3134 		 * is no custom allocator. Mixing SM capable vIOMMU
3135 		 * and non-SM vIOMMU are not supported.
3136 		 */
3137 		intel_iommu_sm = 0;
3138 	}
3139 }
3140 #endif
3141 
3142 static int __init init_dmars(void)
3143 {
3144 	struct dmar_drhd_unit *drhd;
3145 	struct intel_iommu *iommu;
3146 	int ret;
3147 
3148 	/*
3149 	 * for each drhd
3150 	 *    allocate root
3151 	 *    initialize and program root entry to not present
3152 	 * endfor
3153 	 */
3154 	for_each_drhd_unit(drhd) {
3155 		/*
3156 		 * lock not needed as this is only incremented in the single
3157 		 * threaded kernel __init code path all other access are read
3158 		 * only
3159 		 */
3160 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3161 			g_num_of_iommus++;
3162 			continue;
3163 		}
3164 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3165 	}
3166 
3167 	/* Preallocate enough resources for IOMMU hot-addition */
3168 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3169 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3170 
3171 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3172 			GFP_KERNEL);
3173 	if (!g_iommus) {
3174 		pr_err("Allocating global iommu array failed\n");
3175 		ret = -ENOMEM;
3176 		goto error;
3177 	}
3178 
3179 	for_each_iommu(iommu, drhd) {
3180 		if (drhd->ignored) {
3181 			iommu_disable_translation(iommu);
3182 			continue;
3183 		}
3184 
3185 		/*
3186 		 * Find the max pasid size of all IOMMU's in the system.
3187 		 * We need to ensure the system pasid table is no bigger
3188 		 * than the smallest supported.
3189 		 */
3190 		if (pasid_supported(iommu)) {
3191 			u32 temp = 2 << ecap_pss(iommu->ecap);
3192 
3193 			intel_pasid_max_id = min_t(u32, temp,
3194 						   intel_pasid_max_id);
3195 		}
3196 
3197 		g_iommus[iommu->seq_id] = iommu;
3198 
3199 		intel_iommu_init_qi(iommu);
3200 
3201 		ret = iommu_init_domains(iommu);
3202 		if (ret)
3203 			goto free_iommu;
3204 
3205 		init_translation_status(iommu);
3206 
3207 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3208 			iommu_disable_translation(iommu);
3209 			clear_translation_pre_enabled(iommu);
3210 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3211 				iommu->name);
3212 		}
3213 
3214 		/*
3215 		 * TBD:
3216 		 * we could share the same root & context tables
3217 		 * among all IOMMU's. Need to Split it later.
3218 		 */
3219 		ret = iommu_alloc_root_entry(iommu);
3220 		if (ret)
3221 			goto free_iommu;
3222 
3223 		if (translation_pre_enabled(iommu)) {
3224 			pr_info("Translation already enabled - trying to copy translation structures\n");
3225 
3226 			ret = copy_translation_tables(iommu);
3227 			if (ret) {
3228 				/*
3229 				 * We found the IOMMU with translation
3230 				 * enabled - but failed to copy over the
3231 				 * old root-entry table. Try to proceed
3232 				 * by disabling translation now and
3233 				 * allocating a clean root-entry table.
3234 				 * This might cause DMAR faults, but
3235 				 * probably the dump will still succeed.
3236 				 */
3237 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3238 				       iommu->name);
3239 				iommu_disable_translation(iommu);
3240 				clear_translation_pre_enabled(iommu);
3241 			} else {
3242 				pr_info("Copied translation tables from previous kernel for %s\n",
3243 					iommu->name);
3244 			}
3245 		}
3246 
3247 		if (!ecap_pass_through(iommu->ecap))
3248 			hw_pass_through = 0;
3249 		intel_svm_check(iommu);
3250 	}
3251 
3252 	/*
3253 	 * Now that qi is enabled on all iommus, set the root entry and flush
3254 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3255 	 * flush_context function will loop forever and the boot hangs.
3256 	 */
3257 	for_each_active_iommu(iommu, drhd) {
3258 		iommu_flush_write_buffer(iommu);
3259 #ifdef CONFIG_INTEL_IOMMU_SVM
3260 		register_pasid_allocator(iommu);
3261 #endif
3262 		iommu_set_root_entry(iommu);
3263 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3264 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3265 	}
3266 
3267 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3268 	dmar_map_gfx = 0;
3269 #endif
3270 
3271 	if (!dmar_map_gfx)
3272 		iommu_identity_mapping |= IDENTMAP_GFX;
3273 
3274 	check_tylersburg_isoch();
3275 
3276 	ret = si_domain_init(hw_pass_through);
3277 	if (ret)
3278 		goto free_iommu;
3279 
3280 	/*
3281 	 * for each drhd
3282 	 *   enable fault log
3283 	 *   global invalidate context cache
3284 	 *   global invalidate iotlb
3285 	 *   enable translation
3286 	 */
3287 	for_each_iommu(iommu, drhd) {
3288 		if (drhd->ignored) {
3289 			/*
3290 			 * we always have to disable PMRs or DMA may fail on
3291 			 * this device
3292 			 */
3293 			if (force_on)
3294 				iommu_disable_protect_mem_regions(iommu);
3295 			continue;
3296 		}
3297 
3298 		iommu_flush_write_buffer(iommu);
3299 
3300 #ifdef CONFIG_INTEL_IOMMU_SVM
3301 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3302 			/*
3303 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3304 			 * could cause possible lock race condition.
3305 			 */
3306 			up_write(&dmar_global_lock);
3307 			ret = intel_svm_enable_prq(iommu);
3308 			down_write(&dmar_global_lock);
3309 			if (ret)
3310 				goto free_iommu;
3311 		}
3312 #endif
3313 		ret = dmar_set_interrupt(iommu);
3314 		if (ret)
3315 			goto free_iommu;
3316 	}
3317 
3318 	return 0;
3319 
3320 free_iommu:
3321 	for_each_active_iommu(iommu, drhd) {
3322 		disable_dmar_iommu(iommu);
3323 		free_dmar_iommu(iommu);
3324 	}
3325 
3326 	kfree(g_iommus);
3327 
3328 error:
3329 	return ret;
3330 }
3331 
3332 static inline int iommu_domain_cache_init(void)
3333 {
3334 	int ret = 0;
3335 
3336 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3337 					 sizeof(struct dmar_domain),
3338 					 0,
3339 					 SLAB_HWCACHE_ALIGN,
3340 
3341 					 NULL);
3342 	if (!iommu_domain_cache) {
3343 		pr_err("Couldn't create iommu_domain cache\n");
3344 		ret = -ENOMEM;
3345 	}
3346 
3347 	return ret;
3348 }
3349 
3350 static inline int iommu_devinfo_cache_init(void)
3351 {
3352 	int ret = 0;
3353 
3354 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3355 					 sizeof(struct device_domain_info),
3356 					 0,
3357 					 SLAB_HWCACHE_ALIGN,
3358 					 NULL);
3359 	if (!iommu_devinfo_cache) {
3360 		pr_err("Couldn't create devinfo cache\n");
3361 		ret = -ENOMEM;
3362 	}
3363 
3364 	return ret;
3365 }
3366 
3367 static int __init iommu_init_mempool(void)
3368 {
3369 	int ret;
3370 	ret = iova_cache_get();
3371 	if (ret)
3372 		return ret;
3373 
3374 	ret = iommu_domain_cache_init();
3375 	if (ret)
3376 		goto domain_error;
3377 
3378 	ret = iommu_devinfo_cache_init();
3379 	if (!ret)
3380 		return ret;
3381 
3382 	kmem_cache_destroy(iommu_domain_cache);
3383 domain_error:
3384 	iova_cache_put();
3385 
3386 	return -ENOMEM;
3387 }
3388 
3389 static void __init iommu_exit_mempool(void)
3390 {
3391 	kmem_cache_destroy(iommu_devinfo_cache);
3392 	kmem_cache_destroy(iommu_domain_cache);
3393 	iova_cache_put();
3394 }
3395 
3396 static void __init init_no_remapping_devices(void)
3397 {
3398 	struct dmar_drhd_unit *drhd;
3399 	struct device *dev;
3400 	int i;
3401 
3402 	for_each_drhd_unit(drhd) {
3403 		if (!drhd->include_all) {
3404 			for_each_active_dev_scope(drhd->devices,
3405 						  drhd->devices_cnt, i, dev)
3406 				break;
3407 			/* ignore DMAR unit if no devices exist */
3408 			if (i == drhd->devices_cnt)
3409 				drhd->ignored = 1;
3410 		}
3411 	}
3412 
3413 	for_each_active_drhd_unit(drhd) {
3414 		if (drhd->include_all)
3415 			continue;
3416 
3417 		for_each_active_dev_scope(drhd->devices,
3418 					  drhd->devices_cnt, i, dev)
3419 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3420 				break;
3421 		if (i < drhd->devices_cnt)
3422 			continue;
3423 
3424 		/* This IOMMU has *only* gfx devices. Either bypass it or
3425 		   set the gfx_mapped flag, as appropriate */
3426 		drhd->gfx_dedicated = 1;
3427 		if (!dmar_map_gfx)
3428 			drhd->ignored = 1;
3429 	}
3430 }
3431 
3432 #ifdef CONFIG_SUSPEND
3433 static int init_iommu_hw(void)
3434 {
3435 	struct dmar_drhd_unit *drhd;
3436 	struct intel_iommu *iommu = NULL;
3437 
3438 	for_each_active_iommu(iommu, drhd)
3439 		if (iommu->qi)
3440 			dmar_reenable_qi(iommu);
3441 
3442 	for_each_iommu(iommu, drhd) {
3443 		if (drhd->ignored) {
3444 			/*
3445 			 * we always have to disable PMRs or DMA may fail on
3446 			 * this device
3447 			 */
3448 			if (force_on)
3449 				iommu_disable_protect_mem_regions(iommu);
3450 			continue;
3451 		}
3452 
3453 		iommu_flush_write_buffer(iommu);
3454 
3455 		iommu_set_root_entry(iommu);
3456 
3457 		iommu->flush.flush_context(iommu, 0, 0, 0,
3458 					   DMA_CCMD_GLOBAL_INVL);
3459 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3460 		iommu_enable_translation(iommu);
3461 		iommu_disable_protect_mem_regions(iommu);
3462 	}
3463 
3464 	return 0;
3465 }
3466 
3467 static void iommu_flush_all(void)
3468 {
3469 	struct dmar_drhd_unit *drhd;
3470 	struct intel_iommu *iommu;
3471 
3472 	for_each_active_iommu(iommu, drhd) {
3473 		iommu->flush.flush_context(iommu, 0, 0, 0,
3474 					   DMA_CCMD_GLOBAL_INVL);
3475 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3476 					 DMA_TLB_GLOBAL_FLUSH);
3477 	}
3478 }
3479 
3480 static int iommu_suspend(void)
3481 {
3482 	struct dmar_drhd_unit *drhd;
3483 	struct intel_iommu *iommu = NULL;
3484 	unsigned long flag;
3485 
3486 	for_each_active_iommu(iommu, drhd) {
3487 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3488 					     GFP_KERNEL);
3489 		if (!iommu->iommu_state)
3490 			goto nomem;
3491 	}
3492 
3493 	iommu_flush_all();
3494 
3495 	for_each_active_iommu(iommu, drhd) {
3496 		iommu_disable_translation(iommu);
3497 
3498 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3499 
3500 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3501 			readl(iommu->reg + DMAR_FECTL_REG);
3502 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3503 			readl(iommu->reg + DMAR_FEDATA_REG);
3504 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3505 			readl(iommu->reg + DMAR_FEADDR_REG);
3506 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3507 			readl(iommu->reg + DMAR_FEUADDR_REG);
3508 
3509 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3510 	}
3511 	return 0;
3512 
3513 nomem:
3514 	for_each_active_iommu(iommu, drhd)
3515 		kfree(iommu->iommu_state);
3516 
3517 	return -ENOMEM;
3518 }
3519 
3520 static void iommu_resume(void)
3521 {
3522 	struct dmar_drhd_unit *drhd;
3523 	struct intel_iommu *iommu = NULL;
3524 	unsigned long flag;
3525 
3526 	if (init_iommu_hw()) {
3527 		if (force_on)
3528 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3529 		else
3530 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3531 		return;
3532 	}
3533 
3534 	for_each_active_iommu(iommu, drhd) {
3535 
3536 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3537 
3538 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3539 			iommu->reg + DMAR_FECTL_REG);
3540 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3541 			iommu->reg + DMAR_FEDATA_REG);
3542 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3543 			iommu->reg + DMAR_FEADDR_REG);
3544 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3545 			iommu->reg + DMAR_FEUADDR_REG);
3546 
3547 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3548 	}
3549 
3550 	for_each_active_iommu(iommu, drhd)
3551 		kfree(iommu->iommu_state);
3552 }
3553 
3554 static struct syscore_ops iommu_syscore_ops = {
3555 	.resume		= iommu_resume,
3556 	.suspend	= iommu_suspend,
3557 };
3558 
3559 static void __init init_iommu_pm_ops(void)
3560 {
3561 	register_syscore_ops(&iommu_syscore_ops);
3562 }
3563 
3564 #else
/* Without CONFIG_SUSPEND there are no suspend/resume hooks to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif	/* CONFIG_SUSPEND */
3567 
3568 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3569 {
3570 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3571 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3572 	    rmrr->end_address <= rmrr->base_address ||
3573 	    arch_rmrr_sanity_check(rmrr))
3574 		return -EINVAL;
3575 
3576 	return 0;
3577 }
3578 
3579 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3580 {
3581 	struct acpi_dmar_reserved_memory *rmrr;
3582 	struct dmar_rmrr_unit *rmrru;
3583 
3584 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3585 	if (rmrr_sanity_check(rmrr)) {
3586 		pr_warn(FW_BUG
3587 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3588 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3589 			   rmrr->base_address, rmrr->end_address,
3590 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3591 			   dmi_get_system_info(DMI_BIOS_VERSION),
3592 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3593 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3594 	}
3595 
3596 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3597 	if (!rmrru)
3598 		goto out;
3599 
3600 	rmrru->hdr = header;
3601 
3602 	rmrru->base_address = rmrr->base_address;
3603 	rmrru->end_address = rmrr->end_address;
3604 
3605 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3606 				((void *)rmrr) + rmrr->header.length,
3607 				&rmrru->devices_cnt);
3608 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3609 		goto free_rmrru;
3610 
3611 	list_add(&rmrru->list, &dmar_rmrr_units);
3612 
3613 	return 0;
3614 free_rmrru:
3615 	kfree(rmrru);
3616 out:
3617 	return -ENOMEM;
3618 }
3619 
3620 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3621 {
3622 	struct dmar_atsr_unit *atsru;
3623 	struct acpi_dmar_atsr *tmp;
3624 
3625 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3626 				dmar_rcu_check()) {
3627 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3628 		if (atsr->segment != tmp->segment)
3629 			continue;
3630 		if (atsr->header.length != tmp->header.length)
3631 			continue;
3632 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3633 			return atsru;
3634 	}
3635 
3636 	return NULL;
3637 }
3638 
3639 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3640 {
3641 	struct acpi_dmar_atsr *atsr;
3642 	struct dmar_atsr_unit *atsru;
3643 
3644 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3645 		return 0;
3646 
3647 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3648 	atsru = dmar_find_atsr(atsr);
3649 	if (atsru)
3650 		return 0;
3651 
3652 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3653 	if (!atsru)
3654 		return -ENOMEM;
3655 
3656 	/*
3657 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3658 	 * copy the memory content because the memory buffer will be freed
3659 	 * on return.
3660 	 */
3661 	atsru->hdr = (void *)(atsru + 1);
3662 	memcpy(atsru->hdr, hdr, hdr->length);
3663 	atsru->include_all = atsr->flags & 0x1;
3664 	if (!atsru->include_all) {
3665 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3666 				(void *)atsr + atsr->header.length,
3667 				&atsru->devices_cnt);
3668 		if (atsru->devices_cnt && atsru->devices == NULL) {
3669 			kfree(atsru);
3670 			return -ENOMEM;
3671 		}
3672 	}
3673 
3674 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3675 
3676 	return 0;
3677 }
3678 
3679 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3680 {
3681 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3682 	kfree(atsru);
3683 }
3684 
3685 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3686 {
3687 	struct acpi_dmar_atsr *atsr;
3688 	struct dmar_atsr_unit *atsru;
3689 
3690 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3691 	atsru = dmar_find_atsr(atsr);
3692 	if (atsru) {
3693 		list_del_rcu(&atsru->list);
3694 		synchronize_rcu();
3695 		intel_iommu_free_atsr(atsru);
3696 	}
3697 
3698 	return 0;
3699 }
3700 
3701 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3702 {
3703 	int i;
3704 	struct device *dev;
3705 	struct acpi_dmar_atsr *atsr;
3706 	struct dmar_atsr_unit *atsru;
3707 
3708 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3709 	atsru = dmar_find_atsr(atsr);
3710 	if (!atsru)
3711 		return 0;
3712 
3713 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3714 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3715 					  i, dev)
3716 			return -EBUSY;
3717 	}
3718 
3719 	return 0;
3720 }
3721 
3722 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3723 {
3724 	int sp, ret;
3725 	struct intel_iommu *iommu = dmaru->iommu;
3726 
3727 	if (g_iommus[iommu->seq_id])
3728 		return 0;
3729 
3730 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3731 		pr_warn("%s: Doesn't support hardware pass through.\n",
3732 			iommu->name);
3733 		return -ENXIO;
3734 	}
3735 	if (!ecap_sc_support(iommu->ecap) &&
3736 	    domain_update_iommu_snooping(iommu)) {
3737 		pr_warn("%s: Doesn't support snooping.\n",
3738 			iommu->name);
3739 		return -ENXIO;
3740 	}
3741 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3742 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3743 		pr_warn("%s: Doesn't support large page.\n",
3744 			iommu->name);
3745 		return -ENXIO;
3746 	}
3747 
3748 	/*
3749 	 * Disable translation if already enabled prior to OS handover.
3750 	 */
3751 	if (iommu->gcmd & DMA_GCMD_TE)
3752 		iommu_disable_translation(iommu);
3753 
3754 	g_iommus[iommu->seq_id] = iommu;
3755 	ret = iommu_init_domains(iommu);
3756 	if (ret == 0)
3757 		ret = iommu_alloc_root_entry(iommu);
3758 	if (ret)
3759 		goto out;
3760 
3761 	intel_svm_check(iommu);
3762 
3763 	if (dmaru->ignored) {
3764 		/*
3765 		 * we always have to disable PMRs or DMA may fail on this device
3766 		 */
3767 		if (force_on)
3768 			iommu_disable_protect_mem_regions(iommu);
3769 		return 0;
3770 	}
3771 
3772 	intel_iommu_init_qi(iommu);
3773 	iommu_flush_write_buffer(iommu);
3774 
3775 #ifdef CONFIG_INTEL_IOMMU_SVM
3776 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3777 		ret = intel_svm_enable_prq(iommu);
3778 		if (ret)
3779 			goto disable_iommu;
3780 	}
3781 #endif
3782 	ret = dmar_set_interrupt(iommu);
3783 	if (ret)
3784 		goto disable_iommu;
3785 
3786 	iommu_set_root_entry(iommu);
3787 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3788 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3789 	iommu_enable_translation(iommu);
3790 
3791 	iommu_disable_protect_mem_regions(iommu);
3792 	return 0;
3793 
3794 disable_iommu:
3795 	disable_dmar_iommu(iommu);
3796 out:
3797 	free_dmar_iommu(iommu);
3798 	return ret;
3799 }
3800 
3801 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3802 {
3803 	int ret = 0;
3804 	struct intel_iommu *iommu = dmaru->iommu;
3805 
3806 	if (!intel_iommu_enabled)
3807 		return 0;
3808 	if (iommu == NULL)
3809 		return -EINVAL;
3810 
3811 	if (insert) {
3812 		ret = intel_iommu_add(dmaru);
3813 	} else {
3814 		disable_dmar_iommu(iommu);
3815 		free_dmar_iommu(iommu);
3816 	}
3817 
3818 	return ret;
3819 }
3820 
3821 static void intel_iommu_free_dmars(void)
3822 {
3823 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3824 	struct dmar_atsr_unit *atsru, *atsr_n;
3825 
3826 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3827 		list_del(&rmrru->list);
3828 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3829 		kfree(rmrru);
3830 	}
3831 
3832 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3833 		list_del(&atsru->list);
3834 		intel_iommu_free_atsr(atsru);
3835 	}
3836 }
3837 
3838 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3839 {
3840 	int i, ret = 1;
3841 	struct pci_bus *bus;
3842 	struct pci_dev *bridge = NULL;
3843 	struct device *tmp;
3844 	struct acpi_dmar_atsr *atsr;
3845 	struct dmar_atsr_unit *atsru;
3846 
3847 	dev = pci_physfn(dev);
3848 	for (bus = dev->bus; bus; bus = bus->parent) {
3849 		bridge = bus->self;
3850 		/* If it's an integrated device, allow ATS */
3851 		if (!bridge)
3852 			return 1;
3853 		/* Connected via non-PCIe: no ATS */
3854 		if (!pci_is_pcie(bridge) ||
3855 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3856 			return 0;
3857 		/* If we found the root port, look it up in the ATSR */
3858 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3859 			break;
3860 	}
3861 
3862 	rcu_read_lock();
3863 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3864 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3865 		if (atsr->segment != pci_domain_nr(dev->bus))
3866 			continue;
3867 
3868 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3869 			if (tmp == &bridge->dev)
3870 				goto out;
3871 
3872 		if (atsru->include_all)
3873 			goto out;
3874 	}
3875 	ret = 0;
3876 out:
3877 	rcu_read_unlock();
3878 
3879 	return ret;
3880 }
3881 
3882 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3883 {
3884 	int ret;
3885 	struct dmar_rmrr_unit *rmrru;
3886 	struct dmar_atsr_unit *atsru;
3887 	struct acpi_dmar_atsr *atsr;
3888 	struct acpi_dmar_reserved_memory *rmrr;
3889 
3890 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3891 		return 0;
3892 
3893 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3894 		rmrr = container_of(rmrru->hdr,
3895 				    struct acpi_dmar_reserved_memory, header);
3896 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3897 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3898 				((void *)rmrr) + rmrr->header.length,
3899 				rmrr->segment, rmrru->devices,
3900 				rmrru->devices_cnt);
3901 			if (ret < 0)
3902 				return ret;
3903 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3904 			dmar_remove_dev_scope(info, rmrr->segment,
3905 				rmrru->devices, rmrru->devices_cnt);
3906 		}
3907 	}
3908 
3909 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3910 		if (atsru->include_all)
3911 			continue;
3912 
3913 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3914 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3915 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3916 					(void *)atsr + atsr->header.length,
3917 					atsr->segment, atsru->devices,
3918 					atsru->devices_cnt);
3919 			if (ret > 0)
3920 				break;
3921 			else if (ret < 0)
3922 				return ret;
3923 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3924 			if (dmar_remove_dev_scope(info, atsr->segment,
3925 					atsru->devices, atsru->devices_cnt))
3926 				break;
3927 		}
3928 	}
3929 
3930 	return 0;
3931 }
3932 
3933 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3934 				       unsigned long val, void *v)
3935 {
3936 	struct memory_notify *mhp = v;
3937 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3938 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3939 			mhp->nr_pages - 1);
3940 
3941 	switch (val) {
3942 	case MEM_GOING_ONLINE:
3943 		if (iommu_domain_identity_map(si_domain,
3944 					      start_vpfn, last_vpfn)) {
3945 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3946 				start_vpfn, last_vpfn);
3947 			return NOTIFY_BAD;
3948 		}
3949 		break;
3950 
3951 	case MEM_OFFLINE:
3952 	case MEM_CANCEL_ONLINE:
3953 		{
3954 			struct dmar_drhd_unit *drhd;
3955 			struct intel_iommu *iommu;
3956 			struct page *freelist;
3957 
3958 			freelist = domain_unmap(si_domain,
3959 						start_vpfn, last_vpfn,
3960 						NULL);
3961 
3962 			rcu_read_lock();
3963 			for_each_active_iommu(iommu, drhd)
3964 				iommu_flush_iotlb_psi(iommu, si_domain,
3965 					start_vpfn, mhp->nr_pages,
3966 					!freelist, 0);
3967 			rcu_read_unlock();
3968 			dma_free_pagelist(freelist);
3969 		}
3970 		break;
3971 	}
3972 
3973 	return NOTIFY_OK;
3974 }
3975 
3976 static struct notifier_block intel_iommu_memory_nb = {
3977 	.notifier_call = intel_iommu_memory_notifier,
3978 	.priority = 0
3979 };
3980 
3981 static void free_all_cpu_cached_iovas(unsigned int cpu)
3982 {
3983 	int i;
3984 
3985 	for (i = 0; i < g_num_of_iommus; i++) {
3986 		struct intel_iommu *iommu = g_iommus[i];
3987 		struct dmar_domain *domain;
3988 		int did;
3989 
3990 		if (!iommu)
3991 			continue;
3992 
3993 		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
3994 			domain = get_iommu_domain(iommu, (u16)did);
3995 
3996 			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
3997 				continue;
3998 
3999 			iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4000 		}
4001 	}
4002 }
4003 
/*
 * CPU hotplug callback installed for CPUHP_IOMMU_INTEL_DEAD: release the
 * IOVAs cached for @cpu.
 *
 * Return: always 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);

	return 0;
}
4009 
4010 static void intel_disable_iommus(void)
4011 {
4012 	struct intel_iommu *iommu = NULL;
4013 	struct dmar_drhd_unit *drhd;
4014 
4015 	for_each_iommu(iommu, drhd)
4016 		iommu_disable_translation(iommu);
4017 }
4018 
4019 void intel_iommu_shutdown(void)
4020 {
4021 	struct dmar_drhd_unit *drhd;
4022 	struct intel_iommu *iommu = NULL;
4023 
4024 	if (no_iommu || dmar_disabled)
4025 		return;
4026 
4027 	down_write(&dmar_global_lock);
4028 
4029 	/* Disable PMRs explicitly here. */
4030 	for_each_iommu(iommu, drhd)
4031 		iommu_disable_protect_mem_regions(iommu);
4032 
4033 	/* Make sure the IOMMUs are switched off */
4034 	intel_disable_iommus();
4035 
4036 	up_write(&dmar_global_lock);
4037 }
4038 
4039 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4040 {
4041 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4042 
4043 	return container_of(iommu_dev, struct intel_iommu, iommu);
4044 }
4045 
4046 static ssize_t intel_iommu_show_version(struct device *dev,
4047 					struct device_attribute *attr,
4048 					char *buf)
4049 {
4050 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4051 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4052 	return sprintf(buf, "%d:%d\n",
4053 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4054 }
4055 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4056 
4057 static ssize_t intel_iommu_show_address(struct device *dev,
4058 					struct device_attribute *attr,
4059 					char *buf)
4060 {
4061 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4062 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4063 }
4064 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4065 
4066 static ssize_t intel_iommu_show_cap(struct device *dev,
4067 				    struct device_attribute *attr,
4068 				    char *buf)
4069 {
4070 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4071 	return sprintf(buf, "%llx\n", iommu->cap);
4072 }
4073 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4074 
4075 static ssize_t intel_iommu_show_ecap(struct device *dev,
4076 				    struct device_attribute *attr,
4077 				    char *buf)
4078 {
4079 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4080 	return sprintf(buf, "%llx\n", iommu->ecap);
4081 }
4082 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4083 
4084 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4085 				      struct device_attribute *attr,
4086 				      char *buf)
4087 {
4088 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4089 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4090 }
4091 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4092 
4093 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4094 					   struct device_attribute *attr,
4095 					   char *buf)
4096 {
4097 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4098 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4099 						  cap_ndoms(iommu->cap)));
4100 }
4101 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4102 
4103 static struct attribute *intel_iommu_attrs[] = {
4104 	&dev_attr_version.attr,
4105 	&dev_attr_address.attr,
4106 	&dev_attr_cap.attr,
4107 	&dev_attr_ecap.attr,
4108 	&dev_attr_domains_supported.attr,
4109 	&dev_attr_domains_used.attr,
4110 	NULL,
4111 };
4112 
4113 static struct attribute_group intel_iommu_group = {
4114 	.name = "intel-iommu",
4115 	.attrs = intel_iommu_attrs,
4116 };
4117 
4118 const struct attribute_group *intel_iommu_groups[] = {
4119 	&intel_iommu_group,
4120 	NULL,
4121 };
4122 
4123 static inline bool has_external_pci(void)
4124 {
4125 	struct pci_dev *pdev = NULL;
4126 
4127 	for_each_pci_dev(pdev)
4128 		if (pdev->external_facing)
4129 			return true;
4130 
4131 	return false;
4132 }
4133 
4134 static int __init platform_optin_force_iommu(void)
4135 {
4136 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4137 		return 0;
4138 
4139 	if (no_iommu || dmar_disabled)
4140 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4141 
4142 	/*
4143 	 * If Intel-IOMMU is disabled by default, we will apply identity
4144 	 * map for all devices except those marked as being untrusted.
4145 	 */
4146 	if (dmar_disabled)
4147 		iommu_set_default_passthrough(false);
4148 
4149 	dmar_disabled = 0;
4150 	no_iommu = 0;
4151 
4152 	return 1;
4153 }
4154 
4155 static int __init probe_acpi_namespace_devices(void)
4156 {
4157 	struct dmar_drhd_unit *drhd;
4158 	/* To avoid a -Wunused-but-set-variable warning. */
4159 	struct intel_iommu *iommu __maybe_unused;
4160 	struct device *dev;
4161 	int i, ret = 0;
4162 
4163 	for_each_active_iommu(iommu, drhd) {
4164 		for_each_active_dev_scope(drhd->devices,
4165 					  drhd->devices_cnt, i, dev) {
4166 			struct acpi_device_physical_node *pn;
4167 			struct iommu_group *group;
4168 			struct acpi_device *adev;
4169 
4170 			if (dev->bus != &acpi_bus_type)
4171 				continue;
4172 
4173 			adev = to_acpi_device(dev);
4174 			mutex_lock(&adev->physical_node_lock);
4175 			list_for_each_entry(pn,
4176 					    &adev->physical_node_list, node) {
4177 				group = iommu_group_get(pn->dev);
4178 				if (group) {
4179 					iommu_group_put(group);
4180 					continue;
4181 				}
4182 
4183 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4184 				ret = iommu_probe_device(pn->dev);
4185 				if (ret)
4186 					break;
4187 			}
4188 			mutex_unlock(&adev->physical_node_lock);
4189 
4190 			if (ret)
4191 				return ret;
4192 		}
4193 	}
4194 
4195 	return 0;
4196 }
4197 
4198 int __init intel_iommu_init(void)
4199 {
4200 	int ret = -ENODEV;
4201 	struct dmar_drhd_unit *drhd;
4202 	struct intel_iommu *iommu;
4203 
4204 	/*
4205 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4206 	 * opt in, so enforce that.
4207 	 */
4208 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4209 		    platform_optin_force_iommu();
4210 
4211 	if (iommu_init_mempool()) {
4212 		if (force_on)
4213 			panic("tboot: Failed to initialize iommu memory\n");
4214 		return -ENOMEM;
4215 	}
4216 
4217 	down_write(&dmar_global_lock);
4218 	if (dmar_table_init()) {
4219 		if (force_on)
4220 			panic("tboot: Failed to initialize DMAR table\n");
4221 		goto out_free_dmar;
4222 	}
4223 
4224 	if (dmar_dev_scope_init() < 0) {
4225 		if (force_on)
4226 			panic("tboot: Failed to initialize DMAR device scope\n");
4227 		goto out_free_dmar;
4228 	}
4229 
4230 	up_write(&dmar_global_lock);
4231 
4232 	/*
4233 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4234 	 * complain later when we register it under the lock.
4235 	 */
4236 	dmar_register_bus_notifier();
4237 
4238 	down_write(&dmar_global_lock);
4239 
4240 	if (!no_iommu)
4241 		intel_iommu_debugfs_init();
4242 
4243 	if (no_iommu || dmar_disabled) {
4244 		/*
4245 		 * We exit the function here to ensure IOMMU's remapping and
4246 		 * mempool aren't setup, which means that the IOMMU's PMRs
4247 		 * won't be disabled via the call to init_dmars(). So disable
4248 		 * it explicitly here. The PMRs were setup by tboot prior to
4249 		 * calling SENTER, but the kernel is expected to reset/tear
4250 		 * down the PMRs.
4251 		 */
4252 		if (intel_iommu_tboot_noforce) {
4253 			for_each_iommu(iommu, drhd)
4254 				iommu_disable_protect_mem_regions(iommu);
4255 		}
4256 
4257 		/*
4258 		 * Make sure the IOMMUs are switched off, even when we
4259 		 * boot into a kexec kernel and the previous kernel left
4260 		 * them enabled
4261 		 */
4262 		intel_disable_iommus();
4263 		goto out_free_dmar;
4264 	}
4265 
4266 	if (list_empty(&dmar_rmrr_units))
4267 		pr_info("No RMRR found\n");
4268 
4269 	if (list_empty(&dmar_atsr_units))
4270 		pr_info("No ATSR found\n");
4271 
4272 	if (dmar_map_gfx)
4273 		intel_iommu_gfx_mapped = 1;
4274 
4275 	init_no_remapping_devices();
4276 
4277 	ret = init_dmars();
4278 	if (ret) {
4279 		if (force_on)
4280 			panic("tboot: Failed to initialize DMARs\n");
4281 		pr_err("Initialization failed\n");
4282 		goto out_free_dmar;
4283 	}
4284 	up_write(&dmar_global_lock);
4285 
4286 	init_iommu_pm_ops();
4287 
4288 	down_read(&dmar_global_lock);
4289 	for_each_active_iommu(iommu, drhd) {
4290 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4291 				       intel_iommu_groups,
4292 				       "%s", iommu->name);
4293 		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4294 		iommu_device_register(&iommu->iommu);
4295 	}
4296 	up_read(&dmar_global_lock);
4297 
4298 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4299 	if (si_domain && !hw_pass_through)
4300 		register_memory_notifier(&intel_iommu_memory_nb);
4301 	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4302 			  intel_iommu_cpu_dead);
4303 
4304 	down_read(&dmar_global_lock);
4305 	if (probe_acpi_namespace_devices())
4306 		pr_warn("ACPI name space devices didn't probe correctly\n");
4307 
4308 	/* Finally, we enable the DMA remapping hardware. */
4309 	for_each_iommu(iommu, drhd) {
4310 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4311 			iommu_enable_translation(iommu);
4312 
4313 		iommu_disable_protect_mem_regions(iommu);
4314 	}
4315 	up_read(&dmar_global_lock);
4316 
4317 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4318 
4319 	intel_iommu_enabled = 1;
4320 
4321 	return 0;
4322 
4323 out_free_dmar:
4324 	intel_iommu_free_dmars();
4325 	up_write(&dmar_global_lock);
4326 	iommu_exit_mempool();
4327 	return ret;
4328 }
4329 
4330 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4331 {
4332 	struct intel_iommu *iommu = opaque;
4333 
4334 	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4335 	return 0;
4336 }
4337 
4338 /*
4339  * NB - intel-iommu lacks any sort of reference counting for the users of
4340  * dependent devices.  If multiple endpoints have intersecting dependent
4341  * devices, unbinding the driver from any one of them will possibly leave
4342  * the others unable to operate.
4343  */
4344 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4345 {
4346 	if (!iommu || !dev || !dev_is_pci(dev))
4347 		return;
4348 
4349 	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4350 }
4351 
4352 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4353 {
4354 	struct dmar_domain *domain;
4355 	struct intel_iommu *iommu;
4356 	unsigned long flags;
4357 
4358 	assert_spin_locked(&device_domain_lock);
4359 
4360 	if (WARN_ON(!info))
4361 		return;
4362 
4363 	iommu = info->iommu;
4364 	domain = info->domain;
4365 
4366 	if (info->dev) {
4367 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4368 			intel_pasid_tear_down_entry(iommu, info->dev,
4369 					PASID_RID2PASID, false);
4370 
4371 		iommu_disable_dev_iotlb(info);
4372 		if (!dev_is_real_dma_subdevice(info->dev))
4373 			domain_context_clear(iommu, info->dev);
4374 		intel_pasid_free_table(info->dev);
4375 	}
4376 
4377 	unlink_domain_info(info);
4378 
4379 	spin_lock_irqsave(&iommu->lock, flags);
4380 	domain_detach_iommu(domain, iommu);
4381 	spin_unlock_irqrestore(&iommu->lock, flags);
4382 
4383 	free_devinfo_mem(info);
4384 }
4385 
4386 static void dmar_remove_one_dev_info(struct device *dev)
4387 {
4388 	struct device_domain_info *info;
4389 	unsigned long flags;
4390 
4391 	spin_lock_irqsave(&device_domain_lock, flags);
4392 	info = get_domain_info(dev);
4393 	if (info)
4394 		__dmar_remove_one_dev_info(info);
4395 	spin_unlock_irqrestore(&device_domain_lock, flags);
4396 }
4397 
4398 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4399 {
4400 	int adjust_width;
4401 
4402 	/* calculate AGAW */
4403 	domain->gaw = guest_width;
4404 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4405 	domain->agaw = width_to_agaw(adjust_width);
4406 
4407 	domain->iommu_coherency = 0;
4408 	domain->iommu_snooping = 0;
4409 	domain->iommu_superpage = 0;
4410 	domain->max_addr = 0;
4411 
4412 	/* always allocate the top pgd */
4413 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4414 	if (!domain->pgd)
4415 		return -ENOMEM;
4416 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4417 	return 0;
4418 }
4419 
4420 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4421 {
4422 	struct dmar_domain *dmar_domain;
4423 	struct iommu_domain *domain;
4424 
4425 	switch (type) {
4426 	case IOMMU_DOMAIN_DMA:
4427 	case IOMMU_DOMAIN_UNMANAGED:
4428 		dmar_domain = alloc_domain(0);
4429 		if (!dmar_domain) {
4430 			pr_err("Can't allocate dmar_domain\n");
4431 			return NULL;
4432 		}
4433 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4434 			pr_err("Domain initialization failed\n");
4435 			domain_exit(dmar_domain);
4436 			return NULL;
4437 		}
4438 
4439 		if (type == IOMMU_DOMAIN_DMA &&
4440 		    iommu_get_dma_cookie(&dmar_domain->domain))
4441 			return NULL;
4442 
4443 		domain = &dmar_domain->domain;
4444 		domain->geometry.aperture_start = 0;
4445 		domain->geometry.aperture_end   =
4446 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4447 		domain->geometry.force_aperture = true;
4448 
4449 		return domain;
4450 	case IOMMU_DOMAIN_IDENTITY:
4451 		return &si_domain->domain;
4452 	default:
4453 		return NULL;
4454 	}
4455 
4456 	return NULL;
4457 }
4458 
4459 static void intel_iommu_domain_free(struct iommu_domain *domain)
4460 {
4461 	if (domain != &si_domain->domain)
4462 		domain_exit(to_dmar_domain(domain));
4463 }
4464 
4465 /*
4466  * Check whether a @domain could be attached to the @dev through the
4467  * aux-domain attach/detach APIs.
4468  */
4469 static inline bool
4470 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4471 {
4472 	struct device_domain_info *info = get_domain_info(dev);
4473 
4474 	return info && info->auxd_enabled &&
4475 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4476 }
4477 
4478 static void auxiliary_link_device(struct dmar_domain *domain,
4479 				  struct device *dev)
4480 {
4481 	struct device_domain_info *info = get_domain_info(dev);
4482 
4483 	assert_spin_locked(&device_domain_lock);
4484 	if (WARN_ON(!info))
4485 		return;
4486 
4487 	domain->auxd_refcnt++;
4488 	list_add(&domain->auxd, &info->auxiliary_domains);
4489 }
4490 
4491 static void auxiliary_unlink_device(struct dmar_domain *domain,
4492 				    struct device *dev)
4493 {
4494 	struct device_domain_info *info = get_domain_info(dev);
4495 
4496 	assert_spin_locked(&device_domain_lock);
4497 	if (WARN_ON(!info))
4498 		return;
4499 
4500 	list_del(&domain->auxd);
4501 	domain->auxd_refcnt--;
4502 
4503 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
4504 		ioasid_put(domain->default_pasid);
4505 }
4506 
4507 static int aux_domain_add_dev(struct dmar_domain *domain,
4508 			      struct device *dev)
4509 {
4510 	int ret;
4511 	unsigned long flags;
4512 	struct intel_iommu *iommu;
4513 
4514 	iommu = device_to_iommu(dev, NULL, NULL);
4515 	if (!iommu)
4516 		return -ENODEV;
4517 
4518 	if (domain->default_pasid <= 0) {
4519 		u32 pasid;
4520 
4521 		/* No private data needed for the default pasid */
4522 		pasid = ioasid_alloc(NULL, PASID_MIN,
4523 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4524 				     NULL);
4525 		if (pasid == INVALID_IOASID) {
4526 			pr_err("Can't allocate default pasid\n");
4527 			return -ENODEV;
4528 		}
4529 		domain->default_pasid = pasid;
4530 	}
4531 
4532 	spin_lock_irqsave(&device_domain_lock, flags);
4533 	/*
4534 	 * iommu->lock must be held to attach domain to iommu and setup the
4535 	 * pasid entry for second level translation.
4536 	 */
4537 	spin_lock(&iommu->lock);
4538 	ret = domain_attach_iommu(domain, iommu);
4539 	if (ret)
4540 		goto attach_failed;
4541 
4542 	/* Setup the PASID entry for mediated devices: */
4543 	if (domain_use_first_level(domain))
4544 		ret = domain_setup_first_level(iommu, domain, dev,
4545 					       domain->default_pasid);
4546 	else
4547 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4548 						     domain->default_pasid);
4549 	if (ret)
4550 		goto table_failed;
4551 	spin_unlock(&iommu->lock);
4552 
4553 	auxiliary_link_device(domain, dev);
4554 
4555 	spin_unlock_irqrestore(&device_domain_lock, flags);
4556 
4557 	return 0;
4558 
4559 table_failed:
4560 	domain_detach_iommu(domain, iommu);
4561 attach_failed:
4562 	spin_unlock(&iommu->lock);
4563 	spin_unlock_irqrestore(&device_domain_lock, flags);
4564 	if (!domain->auxd_refcnt && domain->default_pasid > 0)
4565 		ioasid_put(domain->default_pasid);
4566 
4567 	return ret;
4568 }
4569 
4570 static void aux_domain_remove_dev(struct dmar_domain *domain,
4571 				  struct device *dev)
4572 {
4573 	struct device_domain_info *info;
4574 	struct intel_iommu *iommu;
4575 	unsigned long flags;
4576 
4577 	if (!is_aux_domain(dev, &domain->domain))
4578 		return;
4579 
4580 	spin_lock_irqsave(&device_domain_lock, flags);
4581 	info = get_domain_info(dev);
4582 	iommu = info->iommu;
4583 
4584 	auxiliary_unlink_device(domain, dev);
4585 
4586 	spin_lock(&iommu->lock);
4587 	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
4588 	domain_detach_iommu(domain, iommu);
4589 	spin_unlock(&iommu->lock);
4590 
4591 	spin_unlock_irqrestore(&device_domain_lock, flags);
4592 }
4593 
4594 static int prepare_domain_attach_device(struct iommu_domain *domain,
4595 					struct device *dev)
4596 {
4597 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4598 	struct intel_iommu *iommu;
4599 	int addr_width;
4600 
4601 	iommu = device_to_iommu(dev, NULL, NULL);
4602 	if (!iommu)
4603 		return -ENODEV;
4604 
4605 	/* check if this iommu agaw is sufficient for max mapped address */
4606 	addr_width = agaw_to_width(iommu->agaw);
4607 	if (addr_width > cap_mgaw(iommu->cap))
4608 		addr_width = cap_mgaw(iommu->cap);
4609 
4610 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4611 		dev_err(dev, "%s: iommu width (%d) is not "
4612 		        "sufficient for the mapped address (%llx)\n",
4613 		        __func__, addr_width, dmar_domain->max_addr);
4614 		return -EFAULT;
4615 	}
4616 	dmar_domain->gaw = addr_width;
4617 
4618 	/*
4619 	 * Knock out extra levels of page tables if necessary
4620 	 */
4621 	while (iommu->agaw < dmar_domain->agaw) {
4622 		struct dma_pte *pte;
4623 
4624 		pte = dmar_domain->pgd;
4625 		if (dma_pte_present(pte)) {
4626 			dmar_domain->pgd = (struct dma_pte *)
4627 				phys_to_virt(dma_pte_addr(pte));
4628 			free_pgtable_page(pte);
4629 		}
4630 		dmar_domain->agaw--;
4631 	}
4632 
4633 	return 0;
4634 }
4635 
4636 static int intel_iommu_attach_device(struct iommu_domain *domain,
4637 				     struct device *dev)
4638 {
4639 	int ret;
4640 
4641 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4642 	    device_is_rmrr_locked(dev)) {
4643 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4644 		return -EPERM;
4645 	}
4646 
4647 	if (is_aux_domain(dev, domain))
4648 		return -EPERM;
4649 
4650 	/* normally dev is not mapped */
4651 	if (unlikely(domain_context_mapped(dev))) {
4652 		struct dmar_domain *old_domain;
4653 
4654 		old_domain = find_domain(dev);
4655 		if (old_domain)
4656 			dmar_remove_one_dev_info(dev);
4657 	}
4658 
4659 	ret = prepare_domain_attach_device(domain, dev);
4660 	if (ret)
4661 		return ret;
4662 
4663 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4664 }
4665 
4666 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4667 					 struct device *dev)
4668 {
4669 	int ret;
4670 
4671 	if (!is_aux_domain(dev, domain))
4672 		return -EPERM;
4673 
4674 	ret = prepare_domain_attach_device(domain, dev);
4675 	if (ret)
4676 		return ret;
4677 
4678 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4679 }
4680 
/*
 * iommu_ops->detach_dev: drop the domain info of @dev. @domain itself is
 * not consulted.
 */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}
4686 
/* iommu_ops aux detach: remove auxiliary @domain from @dev. */
static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	aux_domain_remove_dev(dmar_domain, dev);
}
4692 
4693 #ifdef CONFIG_INTEL_IOMMU_SVM
4694 /*
4695  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4696  * VT-d granularity. Invalidation is typically included in the unmap operation
4697  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4698  * owns the first level page tables. Invalidations of translation caches in the
4699  * guest are trapped and passed down to the host.
4700  *
4701  * vIOMMU in the guest will only expose first level page tables, therefore
4702  * we do not support IOTLB granularity for request without PASID (second level).
4703  *
4704  * For example, to find the VT-d granularity encoding for IOTLB
4705  * type and page selective granularity within PASID:
4706  * X: indexed by iommu cache type
4707  * Y: indexed by enum iommu_inv_granularity
4708  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4709  */
4710 
4711 static const int
4712 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4713 	/*
4714 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4715 	 * page selective (address granularity)
4716 	 */
4717 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4718 	/* PASID based dev TLBs */
4719 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4720 	/* PASID cache */
4721 	{-EINVAL, -EINVAL, -EINVAL}
4722 };
4723 
4724 static inline int to_vtd_granularity(int type, int granu)
4725 {
4726 	return inv_type_granu_table[type][granu];
4727 }
4728 
4729 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4730 {
4731 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4732 
4733 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4734 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
4735 	 * granu size in contiguous memory.
4736 	 */
4737 	return order_base_2(nr_pages);
4738 }
4739 
4740 static int
4741 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4742 			   struct iommu_cache_invalidate_info *inv_info)
4743 {
4744 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4745 	struct device_domain_info *info;
4746 	struct intel_iommu *iommu;
4747 	unsigned long flags;
4748 	int cache_type;
4749 	u8 bus, devfn;
4750 	u16 did, sid;
4751 	int ret = 0;
4752 	u64 size = 0;
4753 
4754 	if (!inv_info || !dmar_domain)
4755 		return -EINVAL;
4756 
4757 	if (!dev || !dev_is_pci(dev))
4758 		return -ENODEV;
4759 
4760 	iommu = device_to_iommu(dev, &bus, &devfn);
4761 	if (!iommu)
4762 		return -ENODEV;
4763 
4764 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4765 		return -EINVAL;
4766 
4767 	spin_lock_irqsave(&device_domain_lock, flags);
4768 	spin_lock(&iommu->lock);
4769 	info = get_domain_info(dev);
4770 	if (!info) {
4771 		ret = -EINVAL;
4772 		goto out_unlock;
4773 	}
4774 	did = dmar_domain->iommu_did[iommu->seq_id];
4775 	sid = PCI_DEVID(bus, devfn);
4776 
4777 	/* Size is only valid in address selective invalidation */
4778 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4779 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4780 				   inv_info->granu.addr_info.nb_granules);
4781 
4782 	for_each_set_bit(cache_type,
4783 			 (unsigned long *)&inv_info->cache,
4784 			 IOMMU_CACHE_INV_TYPE_NR) {
4785 		int granu = 0;
4786 		u64 pasid = 0;
4787 		u64 addr = 0;
4788 
4789 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4790 		if (granu == -EINVAL) {
4791 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4792 					   cache_type, inv_info->granularity);
4793 			break;
4794 		}
4795 
4796 		/*
4797 		 * PASID is stored in different locations based on the
4798 		 * granularity.
4799 		 */
4800 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4801 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4802 			pasid = inv_info->granu.pasid_info.pasid;
4803 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4804 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4805 			pasid = inv_info->granu.addr_info.pasid;
4806 
4807 		switch (BIT(cache_type)) {
4808 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4809 			/* HW will ignore LSB bits based on address mask */
4810 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4811 			    size &&
4812 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4813 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4814 						   inv_info->granu.addr_info.addr, size);
4815 			}
4816 
4817 			/*
4818 			 * If granu is PASID-selective, address is ignored.
4819 			 * We use npages = -1 to indicate that.
4820 			 */
4821 			qi_flush_piotlb(iommu, did, pasid,
4822 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4823 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4824 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4825 
4826 			if (!info->ats_enabled)
4827 				break;
4828 			/*
4829 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4830 			 * in the guest may assume IOTLB flush is inclusive,
4831 			 * which is more efficient.
4832 			 */
4833 			fallthrough;
4834 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4835 			/*
4836 			 * PASID based device TLB invalidation does not support
4837 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4838 			 * IOMMU_INV_GRANU_ADDR.
4839 			 * The equivalent of that is we set the size to be the
4840 			 * entire range of 64 bit. User only provides PASID info
4841 			 * without address info. So we set addr to 0.
4842 			 */
4843 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4844 				size = 64 - VTD_PAGE_SHIFT;
4845 				addr = 0;
4846 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4847 				addr = inv_info->granu.addr_info.addr;
4848 			}
4849 
4850 			if (info->ats_enabled)
4851 				qi_flush_dev_iotlb_pasid(iommu, sid,
4852 						info->pfsid, pasid,
4853 						info->ats_qdep, addr,
4854 						size);
4855 			else
4856 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4857 			break;
4858 		default:
4859 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4860 					    cache_type);
4861 			ret = -EINVAL;
4862 		}
4863 	}
4864 out_unlock:
4865 	spin_unlock(&iommu->lock);
4866 	spin_unlock_irqrestore(&device_domain_lock, flags);
4867 
4868 	return ret;
4869 }
4870 #endif
4871 
4872 static int intel_iommu_map(struct iommu_domain *domain,
4873 			   unsigned long iova, phys_addr_t hpa,
4874 			   size_t size, int iommu_prot, gfp_t gfp)
4875 {
4876 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4877 	u64 max_addr;
4878 	int prot = 0;
4879 	int ret;
4880 
4881 	if (iommu_prot & IOMMU_READ)
4882 		prot |= DMA_PTE_READ;
4883 	if (iommu_prot & IOMMU_WRITE)
4884 		prot |= DMA_PTE_WRITE;
4885 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4886 		prot |= DMA_PTE_SNP;
4887 
4888 	max_addr = iova + size;
4889 	if (dmar_domain->max_addr < max_addr) {
4890 		u64 end;
4891 
4892 		/* check if minimum agaw is sufficient for mapped address */
4893 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4894 		if (end < max_addr) {
4895 			pr_err("%s: iommu width (%d) is not "
4896 			       "sufficient for the mapped address (%llx)\n",
4897 			       __func__, dmar_domain->gaw, max_addr);
4898 			return -EFAULT;
4899 		}
4900 		dmar_domain->max_addr = max_addr;
4901 	}
4902 	/* Round up size to next multiple of PAGE_SIZE, if it and
4903 	   the low bits of hpa would take us onto the next page */
4904 	size = aligned_nrpages(hpa, size);
4905 	ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4906 			     hpa >> VTD_PAGE_SHIFT, size, prot);
4907 	return ret;
4908 }
4909 
4910 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4911 				unsigned long iova, size_t size,
4912 				struct iommu_iotlb_gather *gather)
4913 {
4914 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4915 	unsigned long start_pfn, last_pfn;
4916 	int level = 0;
4917 
4918 	/* Cope with horrid API which requires us to unmap more than the
4919 	   size argument if it happens to be a large-page mapping. */
4920 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4921 
4922 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4923 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4924 
4925 	start_pfn = iova >> VTD_PAGE_SHIFT;
4926 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4927 
4928 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
4929 					last_pfn, gather->freelist);
4930 
4931 	if (dmar_domain->max_addr == iova + size)
4932 		dmar_domain->max_addr = iova;
4933 
4934 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4935 
4936 	return size;
4937 }
4938 
4939 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4940 				 struct iommu_iotlb_gather *gather)
4941 {
4942 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4943 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4944 	size_t size = gather->end - gather->start;
4945 	unsigned long start_pfn;
4946 	unsigned long nrpages;
4947 	int iommu_id;
4948 
4949 	nrpages = aligned_nrpages(gather->start, size);
4950 	start_pfn = mm_to_dma_pfn(iova_pfn);
4951 
4952 	for_each_domain_iommu(iommu_id, dmar_domain)
4953 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4954 				      start_pfn, nrpages, !gather->freelist, 0);
4955 
4956 	dma_free_pagelist(gather->freelist);
4957 }
4958 
4959 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4960 					    dma_addr_t iova)
4961 {
4962 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4963 	struct dma_pte *pte;
4964 	int level = 0;
4965 	u64 phys = 0;
4966 
4967 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4968 	if (pte && dma_pte_present(pte))
4969 		phys = dma_pte_addr(pte) +
4970 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4971 						VTD_PAGE_SHIFT) - 1));
4972 
4973 	return phys;
4974 }
4975 
4976 static inline bool scalable_mode_support(void)
4977 {
4978 	struct dmar_drhd_unit *drhd;
4979 	struct intel_iommu *iommu;
4980 	bool ret = true;
4981 
4982 	rcu_read_lock();
4983 	for_each_active_iommu(iommu, drhd) {
4984 		if (!sm_supported(iommu)) {
4985 			ret = false;
4986 			break;
4987 		}
4988 	}
4989 	rcu_read_unlock();
4990 
4991 	return ret;
4992 }
4993 
4994 static inline bool iommu_pasid_support(void)
4995 {
4996 	struct dmar_drhd_unit *drhd;
4997 	struct intel_iommu *iommu;
4998 	bool ret = true;
4999 
5000 	rcu_read_lock();
5001 	for_each_active_iommu(iommu, drhd) {
5002 		if (!pasid_supported(iommu)) {
5003 			ret = false;
5004 			break;
5005 		}
5006 	}
5007 	rcu_read_unlock();
5008 
5009 	return ret;
5010 }
5011 
5012 static inline bool nested_mode_support(void)
5013 {
5014 	struct dmar_drhd_unit *drhd;
5015 	struct intel_iommu *iommu;
5016 	bool ret = true;
5017 
5018 	rcu_read_lock();
5019 	for_each_active_iommu(iommu, drhd) {
5020 		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5021 			ret = false;
5022 			break;
5023 		}
5024 	}
5025 	rcu_read_unlock();
5026 
5027 	return ret;
5028 }
5029 
5030 static bool intel_iommu_capable(enum iommu_cap cap)
5031 {
5032 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5033 		return domain_update_iommu_snooping(NULL) == 1;
5034 	if (cap == IOMMU_CAP_INTR_REMAP)
5035 		return irq_remapping_enabled == 1;
5036 
5037 	return false;
5038 }
5039 
5040 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5041 {
5042 	struct intel_iommu *iommu;
5043 
5044 	iommu = device_to_iommu(dev, NULL, NULL);
5045 	if (!iommu)
5046 		return ERR_PTR(-ENODEV);
5047 
5048 	if (translation_pre_enabled(iommu))
5049 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5050 
5051 	return &iommu->iommu;
5052 }
5053 
5054 static void intel_iommu_release_device(struct device *dev)
5055 {
5056 	struct intel_iommu *iommu;
5057 
5058 	iommu = device_to_iommu(dev, NULL, NULL);
5059 	if (!iommu)
5060 		return;
5061 
5062 	dmar_remove_one_dev_info(dev);
5063 
5064 	set_dma_ops(dev, NULL);
5065 }
5066 
5067 static void intel_iommu_probe_finalize(struct device *dev)
5068 {
5069 	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5070 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5071 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5072 
5073 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
5074 		iommu_setup_dma_ops(dev, base,
5075 				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5076 	else
5077 		set_dma_ops(dev, NULL);
5078 }
5079 
5080 static void intel_iommu_get_resv_regions(struct device *device,
5081 					 struct list_head *head)
5082 {
5083 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5084 	struct iommu_resv_region *reg;
5085 	struct dmar_rmrr_unit *rmrr;
5086 	struct device *i_dev;
5087 	int i;
5088 
5089 	down_read(&dmar_global_lock);
5090 	for_each_rmrr_units(rmrr) {
5091 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5092 					  i, i_dev) {
5093 			struct iommu_resv_region *resv;
5094 			enum iommu_resv_type type;
5095 			size_t length;
5096 
5097 			if (i_dev != device &&
5098 			    !is_downstream_to_pci_bridge(device, i_dev))
5099 				continue;
5100 
5101 			length = rmrr->end_address - rmrr->base_address + 1;
5102 
5103 			type = device_rmrr_is_relaxable(device) ?
5104 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5105 
5106 			resv = iommu_alloc_resv_region(rmrr->base_address,
5107 						       length, prot, type);
5108 			if (!resv)
5109 				break;
5110 
5111 			list_add_tail(&resv->list, head);
5112 		}
5113 	}
5114 	up_read(&dmar_global_lock);
5115 
5116 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5117 	if (dev_is_pci(device)) {
5118 		struct pci_dev *pdev = to_pci_dev(device);
5119 
5120 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5121 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5122 						   IOMMU_RESV_DIRECT_RELAXABLE);
5123 			if (reg)
5124 				list_add_tail(&reg->list, head);
5125 		}
5126 	}
5127 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5128 
5129 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5130 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5131 				      0, IOMMU_RESV_MSI);
5132 	if (!reg)
5133 		return;
5134 	list_add_tail(&reg->list, head);
5135 }
5136 
5137 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5138 {
5139 	struct device_domain_info *info;
5140 	struct context_entry *context;
5141 	struct dmar_domain *domain;
5142 	unsigned long flags;
5143 	u64 ctx_lo;
5144 	int ret;
5145 
5146 	domain = find_domain(dev);
5147 	if (!domain)
5148 		return -EINVAL;
5149 
5150 	spin_lock_irqsave(&device_domain_lock, flags);
5151 	spin_lock(&iommu->lock);
5152 
5153 	ret = -EINVAL;
5154 	info = get_domain_info(dev);
5155 	if (!info || !info->pasid_supported)
5156 		goto out;
5157 
5158 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5159 	if (WARN_ON(!context))
5160 		goto out;
5161 
5162 	ctx_lo = context[0].lo;
5163 
5164 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5165 		ctx_lo |= CONTEXT_PASIDE;
5166 		context[0].lo = ctx_lo;
5167 		wmb();
5168 		iommu->flush.flush_context(iommu,
5169 					   domain->iommu_did[iommu->seq_id],
5170 					   PCI_DEVID(info->bus, info->devfn),
5171 					   DMA_CCMD_MASK_NOBIT,
5172 					   DMA_CCMD_DEVICE_INVL);
5173 	}
5174 
5175 	/* Enable PASID support in the device, if it wasn't already */
5176 	if (!info->pasid_enabled)
5177 		iommu_enable_dev_iotlb(info);
5178 
5179 	ret = 0;
5180 
5181  out:
5182 	spin_unlock(&iommu->lock);
5183 	spin_unlock_irqrestore(&device_domain_lock, flags);
5184 
5185 	return ret;
5186 }
5187 
5188 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5189 {
5190 	if (dev_is_pci(dev))
5191 		return pci_device_group(dev);
5192 	return generic_device_group(dev);
5193 }
5194 
5195 static int intel_iommu_enable_auxd(struct device *dev)
5196 {
5197 	struct device_domain_info *info;
5198 	struct intel_iommu *iommu;
5199 	unsigned long flags;
5200 	int ret;
5201 
5202 	iommu = device_to_iommu(dev, NULL, NULL);
5203 	if (!iommu || dmar_disabled)
5204 		return -EINVAL;
5205 
5206 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5207 		return -EINVAL;
5208 
5209 	ret = intel_iommu_enable_pasid(iommu, dev);
5210 	if (ret)
5211 		return -ENODEV;
5212 
5213 	spin_lock_irqsave(&device_domain_lock, flags);
5214 	info = get_domain_info(dev);
5215 	info->auxd_enabled = 1;
5216 	spin_unlock_irqrestore(&device_domain_lock, flags);
5217 
5218 	return 0;
5219 }
5220 
5221 static int intel_iommu_disable_auxd(struct device *dev)
5222 {
5223 	struct device_domain_info *info;
5224 	unsigned long flags;
5225 
5226 	spin_lock_irqsave(&device_domain_lock, flags);
5227 	info = get_domain_info(dev);
5228 	if (!WARN_ON(!info))
5229 		info->auxd_enabled = 0;
5230 	spin_unlock_irqrestore(&device_domain_lock, flags);
5231 
5232 	return 0;
5233 }
5234 
5235 /*
5236  * A PCI express designated vendor specific extended capability is defined
5237  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5238  * for system software and tools to detect endpoint devices supporting the
5239  * Intel scalable IO virtualization without host driver dependency.
5240  *
5241  * Returns the address of the matching extended capability structure within
5242  * the device's PCI configuration space or 0 if the device does not support
5243  * it.
5244  */
5245 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5246 {
5247 	int pos;
5248 	u16 vendor, id;
5249 
5250 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5251 	while (pos) {
5252 		pci_read_config_word(pdev, pos + 4, &vendor);
5253 		pci_read_config_word(pdev, pos + 8, &id);
5254 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5255 			return pos;
5256 
5257 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5258 	}
5259 
5260 	return 0;
5261 }
5262 
5263 static bool
5264 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5265 {
5266 	if (feat == IOMMU_DEV_FEAT_AUX) {
5267 		int ret;
5268 
5269 		if (!dev_is_pci(dev) || dmar_disabled ||
5270 		    !scalable_mode_support() || !iommu_pasid_support())
5271 			return false;
5272 
5273 		ret = pci_pasid_features(to_pci_dev(dev));
5274 		if (ret < 0)
5275 			return false;
5276 
5277 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5278 	}
5279 
5280 	if (feat == IOMMU_DEV_FEAT_SVA) {
5281 		struct device_domain_info *info = get_domain_info(dev);
5282 
5283 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5284 			info->pasid_supported && info->pri_supported &&
5285 			info->ats_supported;
5286 	}
5287 
5288 	return false;
5289 }
5290 
5291 static int
5292 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5293 {
5294 	if (feat == IOMMU_DEV_FEAT_AUX)
5295 		return intel_iommu_enable_auxd(dev);
5296 
5297 	if (feat == IOMMU_DEV_FEAT_SVA) {
5298 		struct device_domain_info *info = get_domain_info(dev);
5299 
5300 		if (!info)
5301 			return -EINVAL;
5302 
5303 		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5304 			return 0;
5305 	}
5306 
5307 	return -ENODEV;
5308 }
5309 
5310 static int
5311 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5312 {
5313 	if (feat == IOMMU_DEV_FEAT_AUX)
5314 		return intel_iommu_disable_auxd(dev);
5315 
5316 	return -ENODEV;
5317 }
5318 
5319 static bool
5320 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5321 {
5322 	struct device_domain_info *info = get_domain_info(dev);
5323 
5324 	if (feat == IOMMU_DEV_FEAT_AUX)
5325 		return scalable_mode_support() && info && info->auxd_enabled;
5326 
5327 	return false;
5328 }
5329 
5330 static int
5331 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5332 {
5333 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5334 
5335 	return dmar_domain->default_pasid > 0 ?
5336 			dmar_domain->default_pasid : -EINVAL;
5337 }
5338 
5339 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5340 					   struct device *dev)
5341 {
5342 	return attach_deferred(dev);
5343 }
5344 
5345 static int
5346 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5347 			    enum iommu_attr attr, void *data)
5348 {
5349 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5350 	unsigned long flags;
5351 	int ret = 0;
5352 
5353 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5354 		return -EINVAL;
5355 
5356 	switch (attr) {
5357 	case DOMAIN_ATTR_NESTING:
5358 		spin_lock_irqsave(&device_domain_lock, flags);
5359 		if (nested_mode_support() &&
5360 		    list_empty(&dmar_domain->devices)) {
5361 			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5362 			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5363 		} else {
5364 			ret = -ENODEV;
5365 		}
5366 		spin_unlock_irqrestore(&device_domain_lock, flags);
5367 		break;
5368 	default:
5369 		ret = -EINVAL;
5370 		break;
5371 	}
5372 
5373 	return ret;
5374 }
5375 
5376 static int
5377 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5378 			    enum iommu_attr attr, void *data)
5379 {
5380 	switch (domain->type) {
5381 	case IOMMU_DOMAIN_UNMANAGED:
5382 		return -ENODEV;
5383 	case IOMMU_DOMAIN_DMA:
5384 		switch (attr) {
5385 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5386 			*(int *)data = !intel_iommu_strict;
5387 			return 0;
5388 		default:
5389 			return -ENODEV;
5390 		}
5391 		break;
5392 	default:
5393 		return -EINVAL;
5394 	}
5395 }
5396 
5397 /*
5398  * Check that the device does not live on an external facing PCI port that is
5399  * marked as untrusted. Such devices should not be able to apply quirks and
5400  * thus not be able to bypass the IOMMU restrictions.
5401  */
5402 static bool risky_device(struct pci_dev *pdev)
5403 {
5404 	if (pdev->untrusted) {
5405 		pci_info(pdev,
5406 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5407 			 pdev->vendor, pdev->device);
5408 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5409 		return true;
5410 	}
5411 	return false;
5412 }
5413 
5414 const struct iommu_ops intel_iommu_ops = {
5415 	.capable		= intel_iommu_capable,
5416 	.domain_alloc		= intel_iommu_domain_alloc,
5417 	.domain_free		= intel_iommu_domain_free,
5418 	.domain_get_attr        = intel_iommu_domain_get_attr,
5419 	.domain_set_attr	= intel_iommu_domain_set_attr,
5420 	.attach_dev		= intel_iommu_attach_device,
5421 	.detach_dev		= intel_iommu_detach_device,
5422 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5423 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5424 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5425 	.map			= intel_iommu_map,
5426 	.unmap			= intel_iommu_unmap,
5427 	.flush_iotlb_all        = intel_flush_iotlb_all,
5428 	.iotlb_sync		= intel_iommu_tlb_sync,
5429 	.iova_to_phys		= intel_iommu_iova_to_phys,
5430 	.probe_device		= intel_iommu_probe_device,
5431 	.probe_finalize		= intel_iommu_probe_finalize,
5432 	.release_device		= intel_iommu_release_device,
5433 	.get_resv_regions	= intel_iommu_get_resv_regions,
5434 	.put_resv_regions	= generic_iommu_put_resv_regions,
5435 	.device_group		= intel_iommu_device_group,
5436 	.dev_has_feat		= intel_iommu_dev_has_feat,
5437 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5438 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5439 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5440 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5441 	.def_domain_type	= device_def_domain_type,
5442 	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5443 #ifdef CONFIG_INTEL_IOMMU_SVM
5444 	.cache_invalidate	= intel_iommu_sva_invalidate,
5445 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5446 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5447 	.sva_bind		= intel_svm_bind,
5448 	.sva_unbind		= intel_svm_unbind,
5449 	.sva_get_pasid		= intel_svm_get_pasid,
5450 	.page_response		= intel_svm_page_response,
5451 #endif
5452 };
5453 
5454 static void quirk_iommu_igfx(struct pci_dev *dev)
5455 {
5456 	if (risky_device(dev))
5457 		return;
5458 
5459 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5460 	dmar_map_gfx = 0;
5461 }
5462 
5463 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5470 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5471 
5472 /* Broadwell igfx malfunctions with dmar */
5473 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5488 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5489 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5491 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5492 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5493 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5497 
5498 static void quirk_iommu_rwbf(struct pci_dev *dev)
5499 {
5500 	if (risky_device(dev))
5501 		return;
5502 
5503 	/*
5504 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5505 	 * but needs it. Same seems to hold for the desktop versions.
5506 	 */
5507 	pci_info(dev, "Forcing write-buffer flush capability\n");
5508 	rwbf_quirk = 1;
5509 }
5510 
5511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5518 
5519 #define GGC 0x52
5520 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5521 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5522 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5523 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5524 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5525 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5526 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5527 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5528 
5529 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5530 {
5531 	unsigned short ggc;
5532 
5533 	if (risky_device(dev))
5534 		return;
5535 
5536 	if (pci_read_config_word(dev, GGC, &ggc))
5537 		return;
5538 
5539 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5540 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5541 		dmar_map_gfx = 0;
5542 	} else if (dmar_map_gfx) {
5543 		/* we have to ensure the gfx device is idle before we flush */
5544 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5545 		intel_iommu_strict = 1;
5546        }
5547 }
5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5552 
5553 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5554 {
5555 	unsigned short ver;
5556 
5557 	if (!IS_GFX_DEVICE(dev))
5558 		return;
5559 
5560 	ver = (dev->device >> 8) & 0xff;
5561 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5562 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5563 	    ver != 0x9a)
5564 		return;
5565 
5566 	if (risky_device(dev))
5567 		return;
5568 
5569 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5570 	iommu_skip_te_disable = 1;
5571 }
5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5573 
5574 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5575    ISOCH DMAR unit for the Azalia sound device, but not give it any
5576    TLB entries, which causes it to deadlock. Check for that.  We do
5577    this in a function called from init_dmars(), instead of in a PCI
5578    quirk, because we don't want to print the obnoxious "BIOS broken"
5579    message if VT-d is actually disabled.
5580 */
5581 static void __init check_tylersburg_isoch(void)
5582 {
5583 	struct pci_dev *pdev;
5584 	uint32_t vtisochctrl;
5585 
5586 	/* If there's no Azalia in the system anyway, forget it. */
5587 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5588 	if (!pdev)
5589 		return;
5590 
5591 	if (risky_device(pdev)) {
5592 		pci_dev_put(pdev);
5593 		return;
5594 	}
5595 
5596 	pci_dev_put(pdev);
5597 
5598 	/* System Management Registers. Might be hidden, in which case
5599 	   we can't do the sanity check. But that's OK, because the
5600 	   known-broken BIOSes _don't_ actually hide it, so far. */
5601 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5602 	if (!pdev)
5603 		return;
5604 
5605 	if (risky_device(pdev)) {
5606 		pci_dev_put(pdev);
5607 		return;
5608 	}
5609 
5610 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5611 		pci_dev_put(pdev);
5612 		return;
5613 	}
5614 
5615 	pci_dev_put(pdev);
5616 
5617 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5618 	if (vtisochctrl & 1)
5619 		return;
5620 
5621 	/* Drop all bits other than the number of TLB entries */
5622 	vtisochctrl &= 0x1c;
5623 
5624 	/* If we have the recommended number of TLB entries (16), fine. */
5625 	if (vtisochctrl == 0x10)
5626 		return;
5627 
5628 	/* Zero TLB entries? You get to ride the short bus to school. */
5629 	if (!vtisochctrl) {
5630 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5631 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5632 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5633 		     dmi_get_system_info(DMI_BIOS_VERSION),
5634 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5635 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5636 		return;
5637 	}
5638 
5639 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5640 	       vtisochctrl);
5641 }
5642