xref: /linux/drivers/iommu/intel/iommu.c (revision 9052e9c95d908d6c3d7570aadc8898e1d871c8bb)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/intel-svm.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #include "../irq_remapping.h"
50 #include "../iommu-sva-lib.h"
51 #include "pasid.h"
52 #include "cap_audit.h"
53 
54 #define ROOT_SIZE		VTD_PAGE_SIZE
55 #define CONTEXT_SIZE		VTD_PAGE_SIZE
56 
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 
62 #define IOAPIC_RANGE_START	(0xfee00000)
63 #define IOAPIC_RANGE_END	(0xfeefffff)
64 #define IOVA_START_ADDR		(0x1000)
65 
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
77 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN		(1)
82 
83 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
84 
85 /* page table handling */
86 #define LEVEL_STRIDE		(9)
87 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
88 
89 static inline int agaw_to_level(int agaw)
90 {
91 	return agaw + 2;
92 }
93 
94 static inline int agaw_to_width(int agaw)
95 {
96 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
97 }
98 
99 static inline int width_to_agaw(int width)
100 {
101 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
102 }
103 
104 static inline unsigned int level_to_offset_bits(int level)
105 {
106 	return (level - 1) * LEVEL_STRIDE;
107 }
108 
109 static inline int pfn_level_offset(u64 pfn, int level)
110 {
111 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
112 }
113 
114 static inline u64 level_mask(int level)
115 {
116 	return -1ULL << level_to_offset_bits(level);
117 }
118 
119 static inline u64 level_size(int level)
120 {
121 	return 1ULL << level_to_offset_bits(level);
122 }
123 
124 static inline u64 align_to_level(u64 pfn, int level)
125 {
126 	return (pfn + level_size(level) - 1) & level_mask(level);
127 }
128 
129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
130 {
131 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
132 }
133 
134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
135    are never going to work. */
136 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
137 {
138 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
139 }
140 
141 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
142 {
143 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
144 }
145 static inline unsigned long page_to_dma_pfn(struct page *pg)
146 {
147 	return mm_to_dma_pfn(page_to_pfn(pg));
148 }
149 static inline unsigned long virt_to_dma_pfn(void *p)
150 {
151 	return page_to_dma_pfn(virt_to_page(p));
152 }
153 
154 /* global iommu list, set NULL for ignored DMAR units */
155 static struct intel_iommu **g_iommus;
156 
157 static void __init check_tylersburg_isoch(void);
158 static int rwbf_quirk;
159 
160 /*
161  * set to 1 to panic kernel if can't successfully enable VT-d
162  * (used when kernel is launched w/ TXT)
163  */
164 static int force_on = 0;
165 static int intel_iommu_tboot_noforce;
166 static int no_platform_optin;
167 
168 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
169 
170 /*
171  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
172  * if marked present.
173  */
174 static phys_addr_t root_entry_lctp(struct root_entry *re)
175 {
176 	if (!(re->lo & 1))
177 		return 0;
178 
179 	return re->lo & VTD_PAGE_MASK;
180 }
181 
182 /*
183  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
184  * if marked present.
185  */
186 static phys_addr_t root_entry_uctp(struct root_entry *re)
187 {
188 	if (!(re->hi & 1))
189 		return 0;
190 
191 	return re->hi & VTD_PAGE_MASK;
192 }
193 
194 static inline void context_clear_pasid_enable(struct context_entry *context)
195 {
196 	context->lo &= ~(1ULL << 11);
197 }
198 
199 static inline bool context_pasid_enabled(struct context_entry *context)
200 {
201 	return !!(context->lo & (1ULL << 11));
202 }
203 
204 static inline void context_set_copied(struct context_entry *context)
205 {
206 	context->hi |= (1ull << 3);
207 }
208 
209 static inline bool context_copied(struct context_entry *context)
210 {
211 	return !!(context->hi & (1ULL << 3));
212 }
213 
214 static inline bool __context_present(struct context_entry *context)
215 {
216 	return (context->lo & 1);
217 }
218 
219 bool context_present(struct context_entry *context)
220 {
221 	return context_pasid_enabled(context) ?
222 	     __context_present(context) :
223 	     __context_present(context) && !context_copied(context);
224 }
225 
226 static inline void context_set_present(struct context_entry *context)
227 {
228 	context->lo |= 1;
229 }
230 
231 static inline void context_set_fault_enable(struct context_entry *context)
232 {
233 	context->lo &= (((u64)-1) << 2) | 1;
234 }
235 
236 static inline void context_set_translation_type(struct context_entry *context,
237 						unsigned long value)
238 {
239 	context->lo &= (((u64)-1) << 4) | 3;
240 	context->lo |= (value & 3) << 2;
241 }
242 
243 static inline void context_set_address_root(struct context_entry *context,
244 					    unsigned long value)
245 {
246 	context->lo &= ~VTD_PAGE_MASK;
247 	context->lo |= value & VTD_PAGE_MASK;
248 }
249 
250 static inline void context_set_address_width(struct context_entry *context,
251 					     unsigned long value)
252 {
253 	context->hi |= value & 7;
254 }
255 
256 static inline void context_set_domain_id(struct context_entry *context,
257 					 unsigned long value)
258 {
259 	context->hi |= (value & ((1 << 16) - 1)) << 8;
260 }
261 
262 static inline int context_domain_id(struct context_entry *c)
263 {
264 	return((c->hi >> 8) & 0xffff);
265 }
266 
267 static inline void context_clear_entry(struct context_entry *context)
268 {
269 	context->lo = 0;
270 	context->hi = 0;
271 }
272 
273 /*
274  * This domain is a statically identity mapping domain.
275  *	1. This domain creats a static 1:1 mapping to all usable memory.
276  * 	2. It maps to each iommu if successful.
277  *	3. Each iommu mapps to this domain if successful.
278  */
279 static struct dmar_domain *si_domain;
280 static int hw_pass_through = 1;
281 
282 #define for_each_domain_iommu(idx, domain)			\
283 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
284 		if (domain->iommu_refcnt[idx])
285 
286 struct dmar_rmrr_unit {
287 	struct list_head list;		/* list of rmrr units	*/
288 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
289 	u64	base_address;		/* reserved base address*/
290 	u64	end_address;		/* reserved end address */
291 	struct dmar_dev_scope *devices;	/* target devices */
292 	int	devices_cnt;		/* target device count */
293 };
294 
295 struct dmar_atsr_unit {
296 	struct list_head list;		/* list of ATSR units */
297 	struct acpi_dmar_header *hdr;	/* ACPI header */
298 	struct dmar_dev_scope *devices;	/* target devices */
299 	int devices_cnt;		/* target device count */
300 	u8 include_all:1;		/* include all ports */
301 };
302 
303 struct dmar_satc_unit {
304 	struct list_head list;		/* list of SATC units */
305 	struct acpi_dmar_header *hdr;	/* ACPI header */
306 	struct dmar_dev_scope *devices;	/* target devices */
307 	struct intel_iommu *iommu;	/* the corresponding iommu */
308 	int devices_cnt;		/* target device count */
309 	u8 atc_required:1;		/* ATS is required */
310 };
311 
312 static LIST_HEAD(dmar_atsr_units);
313 static LIST_HEAD(dmar_rmrr_units);
314 static LIST_HEAD(dmar_satc_units);
315 
316 #define for_each_rmrr_units(rmrr) \
317 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
318 
319 /* bitmap for indexing intel_iommus */
320 static int g_num_of_iommus;
321 
322 static void domain_exit(struct dmar_domain *domain);
323 static void domain_remove_dev_info(struct dmar_domain *domain);
324 static void dmar_remove_one_dev_info(struct device *dev);
325 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
326 static int intel_iommu_attach_device(struct iommu_domain *domain,
327 				     struct device *dev);
328 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
329 					    dma_addr_t iova);
330 
331 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
332 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
333 
334 int intel_iommu_enabled = 0;
335 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
336 
337 static int dmar_map_gfx = 1;
338 static int intel_iommu_superpage = 1;
339 static int iommu_identity_mapping;
340 static int iommu_skip_te_disable;
341 
342 #define IDENTMAP_GFX		2
343 #define IDENTMAP_AZALIA		4
344 
345 int intel_iommu_gfx_mapped;
346 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
347 
348 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
349 struct device_domain_info *get_domain_info(struct device *dev)
350 {
351 	struct device_domain_info *info;
352 
353 	if (!dev)
354 		return NULL;
355 
356 	info = dev_iommu_priv_get(dev);
357 	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
358 		return NULL;
359 
360 	return info;
361 }
362 
363 DEFINE_SPINLOCK(device_domain_lock);
364 static LIST_HEAD(device_domain_list);
365 
366 /*
367  * Iterate over elements in device_domain_list and call the specified
368  * callback @fn against each element.
369  */
370 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
371 				     void *data), void *data)
372 {
373 	int ret = 0;
374 	unsigned long flags;
375 	struct device_domain_info *info;
376 
377 	spin_lock_irqsave(&device_domain_lock, flags);
378 	list_for_each_entry(info, &device_domain_list, global) {
379 		ret = fn(info, data);
380 		if (ret) {
381 			spin_unlock_irqrestore(&device_domain_lock, flags);
382 			return ret;
383 		}
384 	}
385 	spin_unlock_irqrestore(&device_domain_lock, flags);
386 
387 	return 0;
388 }
389 
390 const struct iommu_ops intel_iommu_ops;
391 
392 static bool translation_pre_enabled(struct intel_iommu *iommu)
393 {
394 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
395 }
396 
397 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
398 {
399 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
400 }
401 
402 static void init_translation_status(struct intel_iommu *iommu)
403 {
404 	u32 gsts;
405 
406 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
407 	if (gsts & DMA_GSTS_TES)
408 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
409 }
410 
411 static int __init intel_iommu_setup(char *str)
412 {
413 	if (!str)
414 		return -EINVAL;
415 	while (*str) {
416 		if (!strncmp(str, "on", 2)) {
417 			dmar_disabled = 0;
418 			pr_info("IOMMU enabled\n");
419 		} else if (!strncmp(str, "off", 3)) {
420 			dmar_disabled = 1;
421 			no_platform_optin = 1;
422 			pr_info("IOMMU disabled\n");
423 		} else if (!strncmp(str, "igfx_off", 8)) {
424 			dmar_map_gfx = 0;
425 			pr_info("Disable GFX device mapping\n");
426 		} else if (!strncmp(str, "forcedac", 8)) {
427 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
428 			iommu_dma_forcedac = true;
429 		} else if (!strncmp(str, "strict", 6)) {
430 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
431 			iommu_set_dma_strict();
432 		} else if (!strncmp(str, "sp_off", 6)) {
433 			pr_info("Disable supported super page\n");
434 			intel_iommu_superpage = 0;
435 		} else if (!strncmp(str, "sm_on", 5)) {
436 			pr_info("Enable scalable mode if hardware supports\n");
437 			intel_iommu_sm = 1;
438 		} else if (!strncmp(str, "sm_off", 6)) {
439 			pr_info("Scalable mode is disallowed\n");
440 			intel_iommu_sm = 0;
441 		} else if (!strncmp(str, "tboot_noforce", 13)) {
442 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
443 			intel_iommu_tboot_noforce = 1;
444 		}
445 
446 		str += strcspn(str, ",");
447 		while (*str == ',')
448 			str++;
449 	}
450 	return 0;
451 }
452 __setup("intel_iommu=", intel_iommu_setup);
453 
454 static struct kmem_cache *iommu_domain_cache;
455 static struct kmem_cache *iommu_devinfo_cache;
456 
457 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
458 {
459 	struct dmar_domain **domains;
460 	int idx = did >> 8;
461 
462 	domains = iommu->domains[idx];
463 	if (!domains)
464 		return NULL;
465 
466 	return domains[did & 0xff];
467 }
468 
469 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
470 			     struct dmar_domain *domain)
471 {
472 	struct dmar_domain **domains;
473 	int idx = did >> 8;
474 
475 	if (!iommu->domains[idx]) {
476 		size_t size = 256 * sizeof(struct dmar_domain *);
477 		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
478 	}
479 
480 	domains = iommu->domains[idx];
481 	if (WARN_ON(!domains))
482 		return;
483 	else
484 		domains[did & 0xff] = domain;
485 }
486 
487 void *alloc_pgtable_page(int node)
488 {
489 	struct page *page;
490 	void *vaddr = NULL;
491 
492 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
493 	if (page)
494 		vaddr = page_address(page);
495 	return vaddr;
496 }
497 
498 void free_pgtable_page(void *vaddr)
499 {
500 	free_page((unsigned long)vaddr);
501 }
502 
503 static inline void *alloc_domain_mem(void)
504 {
505 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
506 }
507 
508 static void free_domain_mem(void *vaddr)
509 {
510 	kmem_cache_free(iommu_domain_cache, vaddr);
511 }
512 
513 static inline void * alloc_devinfo_mem(void)
514 {
515 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
516 }
517 
518 static inline void free_devinfo_mem(void *vaddr)
519 {
520 	kmem_cache_free(iommu_devinfo_cache, vaddr);
521 }
522 
523 static inline int domain_type_is_si(struct dmar_domain *domain)
524 {
525 	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
526 }
527 
528 static inline bool domain_use_first_level(struct dmar_domain *domain)
529 {
530 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
531 }
532 
533 static inline int domain_pfn_supported(struct dmar_domain *domain,
534 				       unsigned long pfn)
535 {
536 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
537 
538 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
539 }
540 
541 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
542 {
543 	unsigned long sagaw;
544 	int agaw;
545 
546 	sagaw = cap_sagaw(iommu->cap);
547 	for (agaw = width_to_agaw(max_gaw);
548 	     agaw >= 0; agaw--) {
549 		if (test_bit(agaw, &sagaw))
550 			break;
551 	}
552 
553 	return agaw;
554 }
555 
556 /*
557  * Calculate max SAGAW for each iommu.
558  */
559 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
560 {
561 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
562 }
563 
564 /*
565  * calculate agaw for each iommu.
566  * "SAGAW" may be different across iommus, use a default agaw, and
567  * get a supported less agaw for iommus that don't support the default agaw.
568  */
569 int iommu_calculate_agaw(struct intel_iommu *iommu)
570 {
571 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
572 }
573 
574 /* This functionin only returns single iommu in a domain */
575 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
576 {
577 	int iommu_id;
578 
579 	/* si_domain and vm domain should not get here. */
580 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
581 		return NULL;
582 
583 	for_each_domain_iommu(iommu_id, domain)
584 		break;
585 
586 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587 		return NULL;
588 
589 	return g_iommus[iommu_id];
590 }
591 
592 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
593 {
594 	return sm_supported(iommu) ?
595 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
596 }
597 
598 static void domain_update_iommu_coherency(struct dmar_domain *domain)
599 {
600 	struct dmar_drhd_unit *drhd;
601 	struct intel_iommu *iommu;
602 	bool found = false;
603 	int i;
604 
605 	domain->iommu_coherency = true;
606 
607 	for_each_domain_iommu(i, domain) {
608 		found = true;
609 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
610 			domain->iommu_coherency = false;
611 			break;
612 		}
613 	}
614 	if (found)
615 		return;
616 
617 	/* No hardware attached; use lowest common denominator */
618 	rcu_read_lock();
619 	for_each_active_iommu(iommu, drhd) {
620 		if (!iommu_paging_structure_coherency(iommu)) {
621 			domain->iommu_coherency = false;
622 			break;
623 		}
624 	}
625 	rcu_read_unlock();
626 }
627 
628 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
629 {
630 	struct dmar_drhd_unit *drhd;
631 	struct intel_iommu *iommu;
632 	bool ret = true;
633 
634 	rcu_read_lock();
635 	for_each_active_iommu(iommu, drhd) {
636 		if (iommu != skip) {
637 			/*
638 			 * If the hardware is operating in the scalable mode,
639 			 * the snooping control is always supported since we
640 			 * always set PASID-table-entry.PGSNP bit if the domain
641 			 * is managed outside (UNMANAGED).
642 			 */
643 			if (!sm_supported(iommu) &&
644 			    !ecap_sc_support(iommu->ecap)) {
645 				ret = false;
646 				break;
647 			}
648 		}
649 	}
650 	rcu_read_unlock();
651 
652 	return ret;
653 }
654 
655 static int domain_update_iommu_superpage(struct dmar_domain *domain,
656 					 struct intel_iommu *skip)
657 {
658 	struct dmar_drhd_unit *drhd;
659 	struct intel_iommu *iommu;
660 	int mask = 0x3;
661 
662 	if (!intel_iommu_superpage)
663 		return 0;
664 
665 	/* set iommu_superpage to the smallest common denominator */
666 	rcu_read_lock();
667 	for_each_active_iommu(iommu, drhd) {
668 		if (iommu != skip) {
669 			if (domain && domain_use_first_level(domain)) {
670 				if (!cap_fl1gp_support(iommu->cap))
671 					mask = 0x1;
672 			} else {
673 				mask &= cap_super_page_val(iommu->cap);
674 			}
675 
676 			if (!mask)
677 				break;
678 		}
679 	}
680 	rcu_read_unlock();
681 
682 	return fls(mask);
683 }
684 
685 static int domain_update_device_node(struct dmar_domain *domain)
686 {
687 	struct device_domain_info *info;
688 	int nid = NUMA_NO_NODE;
689 
690 	assert_spin_locked(&device_domain_lock);
691 
692 	if (list_empty(&domain->devices))
693 		return NUMA_NO_NODE;
694 
695 	list_for_each_entry(info, &domain->devices, link) {
696 		if (!info->dev)
697 			continue;
698 
699 		/*
700 		 * There could possibly be multiple device numa nodes as devices
701 		 * within the same domain may sit behind different IOMMUs. There
702 		 * isn't perfect answer in such situation, so we select first
703 		 * come first served policy.
704 		 */
705 		nid = dev_to_node(info->dev);
706 		if (nid != NUMA_NO_NODE)
707 			break;
708 	}
709 
710 	return nid;
711 }
712 
713 static void domain_update_iotlb(struct dmar_domain *domain);
714 
715 /* Return the super pagesize bitmap if supported. */
716 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
717 {
718 	unsigned long bitmap = 0;
719 
720 	/*
721 	 * 1-level super page supports page size of 2MiB, 2-level super page
722 	 * supports page size of both 2MiB and 1GiB.
723 	 */
724 	if (domain->iommu_superpage == 1)
725 		bitmap |= SZ_2M;
726 	else if (domain->iommu_superpage == 2)
727 		bitmap |= SZ_2M | SZ_1G;
728 
729 	return bitmap;
730 }
731 
732 /* Some capabilities may be different across iommus */
733 static void domain_update_iommu_cap(struct dmar_domain *domain)
734 {
735 	domain_update_iommu_coherency(domain);
736 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
737 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
738 
739 	/*
740 	 * If RHSA is missing, we should default to the device numa domain
741 	 * as fall back.
742 	 */
743 	if (domain->nid == NUMA_NO_NODE)
744 		domain->nid = domain_update_device_node(domain);
745 
746 	/*
747 	 * First-level translation restricts the input-address to a
748 	 * canonical address (i.e., address bits 63:N have the same
749 	 * value as address bit [N-1], where N is 48-bits with 4-level
750 	 * paging and 57-bits with 5-level paging). Hence, skip bit
751 	 * [N-1].
752 	 */
753 	if (domain_use_first_level(domain))
754 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
755 	else
756 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
757 
758 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
759 	domain_update_iotlb(domain);
760 }
761 
762 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
763 					 u8 devfn, int alloc)
764 {
765 	struct root_entry *root = &iommu->root_entry[bus];
766 	struct context_entry *context;
767 	u64 *entry;
768 
769 	entry = &root->lo;
770 	if (sm_supported(iommu)) {
771 		if (devfn >= 0x80) {
772 			devfn -= 0x80;
773 			entry = &root->hi;
774 		}
775 		devfn *= 2;
776 	}
777 	if (*entry & 1)
778 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
779 	else {
780 		unsigned long phy_addr;
781 		if (!alloc)
782 			return NULL;
783 
784 		context = alloc_pgtable_page(iommu->node);
785 		if (!context)
786 			return NULL;
787 
788 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
789 		phy_addr = virt_to_phys((void *)context);
790 		*entry = phy_addr | 1;
791 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
792 	}
793 	return &context[devfn];
794 }
795 
796 static bool attach_deferred(struct device *dev)
797 {
798 	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
799 }
800 
801 /**
802  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
803  *				 sub-hierarchy of a candidate PCI-PCI bridge
804  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
805  * @bridge: the candidate PCI-PCI bridge
806  *
807  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
808  */
809 static bool
810 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
811 {
812 	struct pci_dev *pdev, *pbridge;
813 
814 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
815 		return false;
816 
817 	pdev = to_pci_dev(dev);
818 	pbridge = to_pci_dev(bridge);
819 
820 	if (pbridge->subordinate &&
821 	    pbridge->subordinate->number <= pdev->bus->number &&
822 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
823 		return true;
824 
825 	return false;
826 }
827 
828 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
829 {
830 	struct dmar_drhd_unit *drhd;
831 	u32 vtbar;
832 	int rc;
833 
834 	/* We know that this device on this chipset has its own IOMMU.
835 	 * If we find it under a different IOMMU, then the BIOS is lying
836 	 * to us. Hope that the IOMMU for this device is actually
837 	 * disabled, and it needs no translation...
838 	 */
839 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
840 	if (rc) {
841 		/* "can't" happen */
842 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
843 		return false;
844 	}
845 	vtbar &= 0xffff0000;
846 
847 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
848 	drhd = dmar_find_matched_drhd_unit(pdev);
849 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
850 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
851 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
852 		return true;
853 	}
854 
855 	return false;
856 }
857 
858 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
859 {
860 	if (!iommu || iommu->drhd->ignored)
861 		return true;
862 
863 	if (dev_is_pci(dev)) {
864 		struct pci_dev *pdev = to_pci_dev(dev);
865 
866 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
867 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
868 		    quirk_ioat_snb_local_iommu(pdev))
869 			return true;
870 	}
871 
872 	return false;
873 }
874 
875 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
876 {
877 	struct dmar_drhd_unit *drhd = NULL;
878 	struct pci_dev *pdev = NULL;
879 	struct intel_iommu *iommu;
880 	struct device *tmp;
881 	u16 segment = 0;
882 	int i;
883 
884 	if (!dev)
885 		return NULL;
886 
887 	if (dev_is_pci(dev)) {
888 		struct pci_dev *pf_pdev;
889 
890 		pdev = pci_real_dma_dev(to_pci_dev(dev));
891 
892 		/* VFs aren't listed in scope tables; we need to look up
893 		 * the PF instead to find the IOMMU. */
894 		pf_pdev = pci_physfn(pdev);
895 		dev = &pf_pdev->dev;
896 		segment = pci_domain_nr(pdev->bus);
897 	} else if (has_acpi_companion(dev))
898 		dev = &ACPI_COMPANION(dev)->dev;
899 
900 	rcu_read_lock();
901 	for_each_iommu(iommu, drhd) {
902 		if (pdev && segment != drhd->segment)
903 			continue;
904 
905 		for_each_active_dev_scope(drhd->devices,
906 					  drhd->devices_cnt, i, tmp) {
907 			if (tmp == dev) {
908 				/* For a VF use its original BDF# not that of the PF
909 				 * which we used for the IOMMU lookup. Strictly speaking
910 				 * we could do this for all PCI devices; we only need to
911 				 * get the BDF# from the scope table for ACPI matches. */
912 				if (pdev && pdev->is_virtfn)
913 					goto got_pdev;
914 
915 				if (bus && devfn) {
916 					*bus = drhd->devices[i].bus;
917 					*devfn = drhd->devices[i].devfn;
918 				}
919 				goto out;
920 			}
921 
922 			if (is_downstream_to_pci_bridge(dev, tmp))
923 				goto got_pdev;
924 		}
925 
926 		if (pdev && drhd->include_all) {
927 		got_pdev:
928 			if (bus && devfn) {
929 				*bus = pdev->bus->number;
930 				*devfn = pdev->devfn;
931 			}
932 			goto out;
933 		}
934 	}
935 	iommu = NULL;
936  out:
937 	if (iommu_is_dummy(iommu, dev))
938 		iommu = NULL;
939 
940 	rcu_read_unlock();
941 
942 	return iommu;
943 }
944 
945 static void domain_flush_cache(struct dmar_domain *domain,
946 			       void *addr, int size)
947 {
948 	if (!domain->iommu_coherency)
949 		clflush_cache_range(addr, size);
950 }
951 
952 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
953 {
954 	struct context_entry *context;
955 	int ret = 0;
956 	unsigned long flags;
957 
958 	spin_lock_irqsave(&iommu->lock, flags);
959 	context = iommu_context_addr(iommu, bus, devfn, 0);
960 	if (context)
961 		ret = context_present(context);
962 	spin_unlock_irqrestore(&iommu->lock, flags);
963 	return ret;
964 }
965 
966 static void free_context_table(struct intel_iommu *iommu)
967 {
968 	int i;
969 	unsigned long flags;
970 	struct context_entry *context;
971 
972 	spin_lock_irqsave(&iommu->lock, flags);
973 	if (!iommu->root_entry) {
974 		goto out;
975 	}
976 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
977 		context = iommu_context_addr(iommu, i, 0, 0);
978 		if (context)
979 			free_pgtable_page(context);
980 
981 		if (!sm_supported(iommu))
982 			continue;
983 
984 		context = iommu_context_addr(iommu, i, 0x80, 0);
985 		if (context)
986 			free_pgtable_page(context);
987 
988 	}
989 	free_pgtable_page(iommu->root_entry);
990 	iommu->root_entry = NULL;
991 out:
992 	spin_unlock_irqrestore(&iommu->lock, flags);
993 }
994 
995 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
996 				      unsigned long pfn, int *target_level)
997 {
998 	struct dma_pte *parent, *pte;
999 	int level = agaw_to_level(domain->agaw);
1000 	int offset;
1001 
1002 	BUG_ON(!domain->pgd);
1003 
1004 	if (!domain_pfn_supported(domain, pfn))
1005 		/* Address beyond IOMMU's addressing capabilities. */
1006 		return NULL;
1007 
1008 	parent = domain->pgd;
1009 
1010 	while (1) {
1011 		void *tmp_page;
1012 
1013 		offset = pfn_level_offset(pfn, level);
1014 		pte = &parent[offset];
1015 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1016 			break;
1017 		if (level == *target_level)
1018 			break;
1019 
1020 		if (!dma_pte_present(pte)) {
1021 			uint64_t pteval;
1022 
1023 			tmp_page = alloc_pgtable_page(domain->nid);
1024 
1025 			if (!tmp_page)
1026 				return NULL;
1027 
1028 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1029 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1030 			if (domain_use_first_level(domain)) {
1031 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1032 				if (iommu_is_dma_domain(&domain->domain))
1033 					pteval |= DMA_FL_PTE_ACCESS;
1034 			}
1035 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1036 				/* Someone else set it while we were thinking; use theirs. */
1037 				free_pgtable_page(tmp_page);
1038 			else
1039 				domain_flush_cache(domain, pte, sizeof(*pte));
1040 		}
1041 		if (level == 1)
1042 			break;
1043 
1044 		parent = phys_to_virt(dma_pte_addr(pte));
1045 		level--;
1046 	}
1047 
1048 	if (!*target_level)
1049 		*target_level = level;
1050 
1051 	return pte;
1052 }
1053 
1054 /* return address's pte at specific level */
1055 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1056 					 unsigned long pfn,
1057 					 int level, int *large_page)
1058 {
1059 	struct dma_pte *parent, *pte;
1060 	int total = agaw_to_level(domain->agaw);
1061 	int offset;
1062 
1063 	parent = domain->pgd;
1064 	while (level <= total) {
1065 		offset = pfn_level_offset(pfn, total);
1066 		pte = &parent[offset];
1067 		if (level == total)
1068 			return pte;
1069 
1070 		if (!dma_pte_present(pte)) {
1071 			*large_page = total;
1072 			break;
1073 		}
1074 
1075 		if (dma_pte_superpage(pte)) {
1076 			*large_page = total;
1077 			return pte;
1078 		}
1079 
1080 		parent = phys_to_virt(dma_pte_addr(pte));
1081 		total--;
1082 	}
1083 	return NULL;
1084 }
1085 
1086 /* clear last level pte, a tlb flush should be followed */
1087 static void dma_pte_clear_range(struct dmar_domain *domain,
1088 				unsigned long start_pfn,
1089 				unsigned long last_pfn)
1090 {
1091 	unsigned int large_page;
1092 	struct dma_pte *first_pte, *pte;
1093 
1094 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1095 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1096 	BUG_ON(start_pfn > last_pfn);
1097 
1098 	/* we don't need lock here; nobody else touches the iova range */
1099 	do {
1100 		large_page = 1;
1101 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1102 		if (!pte) {
1103 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1104 			continue;
1105 		}
1106 		do {
1107 			dma_clear_pte(pte);
1108 			start_pfn += lvl_to_nr_pages(large_page);
1109 			pte++;
1110 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1111 
1112 		domain_flush_cache(domain, first_pte,
1113 				   (void *)pte - (void *)first_pte);
1114 
1115 	} while (start_pfn && start_pfn <= last_pfn);
1116 }
1117 
1118 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1119 			       int retain_level, struct dma_pte *pte,
1120 			       unsigned long pfn, unsigned long start_pfn,
1121 			       unsigned long last_pfn)
1122 {
1123 	pfn = max(start_pfn, pfn);
1124 	pte = &pte[pfn_level_offset(pfn, level)];
1125 
1126 	do {
1127 		unsigned long level_pfn;
1128 		struct dma_pte *level_pte;
1129 
1130 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1131 			goto next;
1132 
1133 		level_pfn = pfn & level_mask(level);
1134 		level_pte = phys_to_virt(dma_pte_addr(pte));
1135 
1136 		if (level > 2) {
1137 			dma_pte_free_level(domain, level - 1, retain_level,
1138 					   level_pte, level_pfn, start_pfn,
1139 					   last_pfn);
1140 		}
1141 
1142 		/*
1143 		 * Free the page table if we're below the level we want to
1144 		 * retain and the range covers the entire table.
1145 		 */
1146 		if (level < retain_level && !(start_pfn > level_pfn ||
1147 		      last_pfn < level_pfn + level_size(level) - 1)) {
1148 			dma_clear_pte(pte);
1149 			domain_flush_cache(domain, pte, sizeof(*pte));
1150 			free_pgtable_page(level_pte);
1151 		}
1152 next:
1153 		pfn += level_size(level);
1154 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155 }
1156 
1157 /*
1158  * clear last level (leaf) ptes and free page table pages below the
1159  * level we wish to keep intact.
1160  */
1161 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1162 				   unsigned long start_pfn,
1163 				   unsigned long last_pfn,
1164 				   int retain_level)
1165 {
1166 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1167 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1168 	BUG_ON(start_pfn > last_pfn);
1169 
1170 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1171 
1172 	/* We don't need lock here; nobody else touches the iova range */
1173 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1174 			   domain->pgd, 0, start_pfn, last_pfn);
1175 
1176 	/* free pgd */
1177 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 		free_pgtable_page(domain->pgd);
1179 		domain->pgd = NULL;
1180 	}
1181 }
1182 
1183 /* When a page at a given level is being unlinked from its parent, we don't
1184    need to *modify* it at all. All we need to do is make a list of all the
1185    pages which can be freed just as soon as we've flushed the IOTLB and we
1186    know the hardware page-walk will no longer touch them.
1187    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1188    be freed. */
1189 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1190 					    int level, struct dma_pte *pte,
1191 					    struct page *freelist)
1192 {
1193 	struct page *pg;
1194 
1195 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1196 	pg->freelist = freelist;
1197 	freelist = pg;
1198 
1199 	if (level == 1)
1200 		return freelist;
1201 
1202 	pte = page_address(pg);
1203 	do {
1204 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1205 			freelist = dma_pte_list_pagetables(domain, level - 1,
1206 							   pte, freelist);
1207 		pte++;
1208 	} while (!first_pte_in_page(pte));
1209 
1210 	return freelist;
1211 }
1212 
1213 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1214 					struct dma_pte *pte, unsigned long pfn,
1215 					unsigned long start_pfn,
1216 					unsigned long last_pfn,
1217 					struct page *freelist)
1218 {
1219 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1220 
1221 	pfn = max(start_pfn, pfn);
1222 	pte = &pte[pfn_level_offset(pfn, level)];
1223 
1224 	do {
1225 		unsigned long level_pfn;
1226 
1227 		if (!dma_pte_present(pte))
1228 			goto next;
1229 
1230 		level_pfn = pfn & level_mask(level);
1231 
1232 		/* If range covers entire pagetable, free it */
1233 		if (start_pfn <= level_pfn &&
1234 		    last_pfn >= level_pfn + level_size(level) - 1) {
1235 			/* These suborbinate page tables are going away entirely. Don't
1236 			   bother to clear them; we're just going to *free* them. */
1237 			if (level > 1 && !dma_pte_superpage(pte))
1238 				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1239 
1240 			dma_clear_pte(pte);
1241 			if (!first_pte)
1242 				first_pte = pte;
1243 			last_pte = pte;
1244 		} else if (level > 1) {
1245 			/* Recurse down into a level that isn't *entirely* obsolete */
1246 			freelist = dma_pte_clear_level(domain, level - 1,
1247 						       phys_to_virt(dma_pte_addr(pte)),
1248 						       level_pfn, start_pfn, last_pfn,
1249 						       freelist);
1250 		}
1251 next:
1252 		pfn += level_size(level);
1253 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1254 
1255 	if (first_pte)
1256 		domain_flush_cache(domain, first_pte,
1257 				   (void *)++last_pte - (void *)first_pte);
1258 
1259 	return freelist;
1260 }
1261 
1262 /* We can't just free the pages because the IOMMU may still be walking
1263    the page tables, and may have cached the intermediate levels. The
1264    pages can only be freed after the IOTLB flush has been done. */
1265 static struct page *domain_unmap(struct dmar_domain *domain,
1266 				 unsigned long start_pfn,
1267 				 unsigned long last_pfn,
1268 				 struct page *freelist)
1269 {
1270 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1271 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1272 	BUG_ON(start_pfn > last_pfn);
1273 
1274 	/* we don't need lock here; nobody else touches the iova range */
1275 	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1276 				       domain->pgd, 0, start_pfn, last_pfn,
1277 				       freelist);
1278 
1279 	/* free pgd */
1280 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1281 		struct page *pgd_page = virt_to_page(domain->pgd);
1282 		pgd_page->freelist = freelist;
1283 		freelist = pgd_page;
1284 
1285 		domain->pgd = NULL;
1286 	}
1287 
1288 	return freelist;
1289 }
1290 
1291 static void dma_free_pagelist(struct page *freelist)
1292 {
1293 	struct page *pg;
1294 
1295 	while ((pg = freelist)) {
1296 		freelist = pg->freelist;
1297 		free_pgtable_page(page_address(pg));
1298 	}
1299 }
1300 
1301 /* iommu handling */
1302 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1303 {
1304 	struct root_entry *root;
1305 	unsigned long flags;
1306 
1307 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1308 	if (!root) {
1309 		pr_err("Allocating root entry for %s failed\n",
1310 			iommu->name);
1311 		return -ENOMEM;
1312 	}
1313 
1314 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1315 
1316 	spin_lock_irqsave(&iommu->lock, flags);
1317 	iommu->root_entry = root;
1318 	spin_unlock_irqrestore(&iommu->lock, flags);
1319 
1320 	return 0;
1321 }
1322 
1323 static void iommu_set_root_entry(struct intel_iommu *iommu)
1324 {
1325 	u64 addr;
1326 	u32 sts;
1327 	unsigned long flag;
1328 
1329 	addr = virt_to_phys(iommu->root_entry);
1330 	if (sm_supported(iommu))
1331 		addr |= DMA_RTADDR_SMT;
1332 
1333 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1334 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1335 
1336 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1337 
1338 	/* Make sure hardware complete it */
1339 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1340 		      readl, (sts & DMA_GSTS_RTPS), sts);
1341 
1342 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1343 
1344 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1345 	if (sm_supported(iommu))
1346 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1347 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1348 }
1349 
1350 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1351 {
1352 	u32 val;
1353 	unsigned long flag;
1354 
1355 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1356 		return;
1357 
1358 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1359 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1360 
1361 	/* Make sure hardware complete it */
1362 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1363 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1364 
1365 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1366 }
1367 
1368 /* return value determine if we need a write buffer flush */
1369 static void __iommu_flush_context(struct intel_iommu *iommu,
1370 				  u16 did, u16 source_id, u8 function_mask,
1371 				  u64 type)
1372 {
1373 	u64 val = 0;
1374 	unsigned long flag;
1375 
1376 	switch (type) {
1377 	case DMA_CCMD_GLOBAL_INVL:
1378 		val = DMA_CCMD_GLOBAL_INVL;
1379 		break;
1380 	case DMA_CCMD_DOMAIN_INVL:
1381 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1382 		break;
1383 	case DMA_CCMD_DEVICE_INVL:
1384 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1385 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1386 		break;
1387 	default:
1388 		BUG();
1389 	}
1390 	val |= DMA_CCMD_ICC;
1391 
1392 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1393 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1394 
1395 	/* Make sure hardware complete it */
1396 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1397 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1398 
1399 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1400 }
1401 
1402 /* return value determine if we need a write buffer flush */
1403 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1404 				u64 addr, unsigned int size_order, u64 type)
1405 {
1406 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1407 	u64 val = 0, val_iva = 0;
1408 	unsigned long flag;
1409 
1410 	switch (type) {
1411 	case DMA_TLB_GLOBAL_FLUSH:
1412 		/* global flush doesn't need set IVA_REG */
1413 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1414 		break;
1415 	case DMA_TLB_DSI_FLUSH:
1416 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1417 		break;
1418 	case DMA_TLB_PSI_FLUSH:
1419 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1420 		/* IH bit is passed in as part of address */
1421 		val_iva = size_order | addr;
1422 		break;
1423 	default:
1424 		BUG();
1425 	}
1426 	/* Note: set drain read/write */
1427 #if 0
1428 	/*
1429 	 * This is probably to be super secure.. Looks like we can
1430 	 * ignore it without any impact.
1431 	 */
1432 	if (cap_read_drain(iommu->cap))
1433 		val |= DMA_TLB_READ_DRAIN;
1434 #endif
1435 	if (cap_write_drain(iommu->cap))
1436 		val |= DMA_TLB_WRITE_DRAIN;
1437 
1438 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1439 	/* Note: Only uses first TLB reg currently */
1440 	if (val_iva)
1441 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1442 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1443 
1444 	/* Make sure hardware complete it */
1445 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1446 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1447 
1448 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1449 
1450 	/* check IOTLB invalidation granularity */
1451 	if (DMA_TLB_IAIG(val) == 0)
1452 		pr_err("Flush IOTLB failed\n");
1453 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1454 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1455 			(unsigned long long)DMA_TLB_IIRG(type),
1456 			(unsigned long long)DMA_TLB_IAIG(val));
1457 }
1458 
1459 static struct device_domain_info *
1460 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1461 			 u8 bus, u8 devfn)
1462 {
1463 	struct device_domain_info *info;
1464 
1465 	assert_spin_locked(&device_domain_lock);
1466 
1467 	if (!iommu->qi)
1468 		return NULL;
1469 
1470 	list_for_each_entry(info, &domain->devices, link)
1471 		if (info->iommu == iommu && info->bus == bus &&
1472 		    info->devfn == devfn) {
1473 			if (info->ats_supported && info->dev)
1474 				return info;
1475 			break;
1476 		}
1477 
1478 	return NULL;
1479 }
1480 
1481 static void domain_update_iotlb(struct dmar_domain *domain)
1482 {
1483 	struct device_domain_info *info;
1484 	bool has_iotlb_device = false;
1485 
1486 	assert_spin_locked(&device_domain_lock);
1487 
1488 	list_for_each_entry(info, &domain->devices, link)
1489 		if (info->ats_enabled) {
1490 			has_iotlb_device = true;
1491 			break;
1492 		}
1493 
1494 	if (!has_iotlb_device) {
1495 		struct subdev_domain_info *sinfo;
1496 
1497 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1498 			info = get_domain_info(sinfo->pdev);
1499 			if (info && info->ats_enabled) {
1500 				has_iotlb_device = true;
1501 				break;
1502 			}
1503 		}
1504 	}
1505 
1506 	domain->has_iotlb_device = has_iotlb_device;
1507 }
1508 
1509 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1510 {
1511 	struct pci_dev *pdev;
1512 
1513 	assert_spin_locked(&device_domain_lock);
1514 
1515 	if (!info || !dev_is_pci(info->dev))
1516 		return;
1517 
1518 	pdev = to_pci_dev(info->dev);
1519 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1520 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1521 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1522 	 * reserved, which should be set to 0.
1523 	 */
1524 	if (!ecap_dit(info->iommu->ecap))
1525 		info->pfsid = 0;
1526 	else {
1527 		struct pci_dev *pf_pdev;
1528 
1529 		/* pdev will be returned if device is not a vf */
1530 		pf_pdev = pci_physfn(pdev);
1531 		info->pfsid = pci_dev_id(pf_pdev);
1532 	}
1533 
1534 #ifdef CONFIG_INTEL_IOMMU_SVM
1535 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1536 	   the device if you enable PASID support after ATS support is
1537 	   undefined. So always enable PASID support on devices which
1538 	   have it, even if we can't yet know if we're ever going to
1539 	   use it. */
1540 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1541 		info->pasid_enabled = 1;
1542 
1543 	if (info->pri_supported &&
1544 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1545 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1546 		info->pri_enabled = 1;
1547 #endif
1548 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1549 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1550 		info->ats_enabled = 1;
1551 		domain_update_iotlb(info->domain);
1552 		info->ats_qdep = pci_ats_queue_depth(pdev);
1553 	}
1554 }
1555 
1556 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1557 {
1558 	struct pci_dev *pdev;
1559 
1560 	assert_spin_locked(&device_domain_lock);
1561 
1562 	if (!dev_is_pci(info->dev))
1563 		return;
1564 
1565 	pdev = to_pci_dev(info->dev);
1566 
1567 	if (info->ats_enabled) {
1568 		pci_disable_ats(pdev);
1569 		info->ats_enabled = 0;
1570 		domain_update_iotlb(info->domain);
1571 	}
1572 #ifdef CONFIG_INTEL_IOMMU_SVM
1573 	if (info->pri_enabled) {
1574 		pci_disable_pri(pdev);
1575 		info->pri_enabled = 0;
1576 	}
1577 	if (info->pasid_enabled) {
1578 		pci_disable_pasid(pdev);
1579 		info->pasid_enabled = 0;
1580 	}
1581 #endif
1582 }
1583 
1584 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1585 				    u64 addr, unsigned int mask)
1586 {
1587 	u16 sid, qdep;
1588 
1589 	if (!info || !info->ats_enabled)
1590 		return;
1591 
1592 	sid = info->bus << 8 | info->devfn;
1593 	qdep = info->ats_qdep;
1594 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1595 			   qdep, addr, mask);
1596 }
1597 
1598 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1599 				  u64 addr, unsigned mask)
1600 {
1601 	unsigned long flags;
1602 	struct device_domain_info *info;
1603 	struct subdev_domain_info *sinfo;
1604 
1605 	if (!domain->has_iotlb_device)
1606 		return;
1607 
1608 	spin_lock_irqsave(&device_domain_lock, flags);
1609 	list_for_each_entry(info, &domain->devices, link)
1610 		__iommu_flush_dev_iotlb(info, addr, mask);
1611 
1612 	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1613 		info = get_domain_info(sinfo->pdev);
1614 		__iommu_flush_dev_iotlb(info, addr, mask);
1615 	}
1616 	spin_unlock_irqrestore(&device_domain_lock, flags);
1617 }
1618 
1619 static void domain_flush_piotlb(struct intel_iommu *iommu,
1620 				struct dmar_domain *domain,
1621 				u64 addr, unsigned long npages, bool ih)
1622 {
1623 	u16 did = domain->iommu_did[iommu->seq_id];
1624 
1625 	if (domain->default_pasid)
1626 		qi_flush_piotlb(iommu, did, domain->default_pasid,
1627 				addr, npages, ih);
1628 
1629 	if (!list_empty(&domain->devices))
1630 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1631 }
1632 
1633 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1634 				  struct dmar_domain *domain,
1635 				  unsigned long pfn, unsigned int pages,
1636 				  int ih, int map)
1637 {
1638 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1639 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1640 	u16 did = domain->iommu_did[iommu->seq_id];
1641 
1642 	BUG_ON(pages == 0);
1643 
1644 	if (ih)
1645 		ih = 1 << 6;
1646 
1647 	if (domain_use_first_level(domain)) {
1648 		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1649 	} else {
1650 		/*
1651 		 * Fallback to domain selective flush if no PSI support or
1652 		 * the size is too big. PSI requires page size to be 2 ^ x,
1653 		 * and the base address is naturally aligned to the size.
1654 		 */
1655 		if (!cap_pgsel_inv(iommu->cap) ||
1656 		    mask > cap_max_amask_val(iommu->cap))
1657 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1658 							DMA_TLB_DSI_FLUSH);
1659 		else
1660 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1661 							DMA_TLB_PSI_FLUSH);
1662 	}
1663 
1664 	/*
1665 	 * In caching mode, changes of pages from non-present to present require
1666 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1667 	 */
1668 	if (!cap_caching_mode(iommu->cap) || !map)
1669 		iommu_flush_dev_iotlb(domain, addr, mask);
1670 }
1671 
1672 /* Notification for newly created mappings */
1673 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1674 					struct dmar_domain *domain,
1675 					unsigned long pfn, unsigned int pages)
1676 {
1677 	/*
1678 	 * It's a non-present to present mapping. Only flush if caching mode
1679 	 * and second level.
1680 	 */
1681 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1682 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1683 	else
1684 		iommu_flush_write_buffer(iommu);
1685 }
1686 
1687 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1688 {
1689 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1690 	int idx;
1691 
1692 	for_each_domain_iommu(idx, dmar_domain) {
1693 		struct intel_iommu *iommu = g_iommus[idx];
1694 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1695 
1696 		if (domain_use_first_level(dmar_domain))
1697 			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1698 		else
1699 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1700 						 DMA_TLB_DSI_FLUSH);
1701 
1702 		if (!cap_caching_mode(iommu->cap))
1703 			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1704 					      0, MAX_AGAW_PFN_WIDTH);
1705 	}
1706 }
1707 
1708 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1709 {
1710 	u32 pmen;
1711 	unsigned long flags;
1712 
1713 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1714 		return;
1715 
1716 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1717 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1718 	pmen &= ~DMA_PMEN_EPM;
1719 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1720 
1721 	/* wait for the protected region status bit to clear */
1722 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1723 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1724 
1725 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1726 }
1727 
1728 static void iommu_enable_translation(struct intel_iommu *iommu)
1729 {
1730 	u32 sts;
1731 	unsigned long flags;
1732 
1733 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1734 	iommu->gcmd |= DMA_GCMD_TE;
1735 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1736 
1737 	/* Make sure hardware complete it */
1738 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1739 		      readl, (sts & DMA_GSTS_TES), sts);
1740 
1741 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1742 }
1743 
1744 static void iommu_disable_translation(struct intel_iommu *iommu)
1745 {
1746 	u32 sts;
1747 	unsigned long flag;
1748 
1749 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1750 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1751 		return;
1752 
1753 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1754 	iommu->gcmd &= ~DMA_GCMD_TE;
1755 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1756 
1757 	/* Make sure hardware complete it */
1758 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1759 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1760 
1761 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1762 }
1763 
1764 static int iommu_init_domains(struct intel_iommu *iommu)
1765 {
1766 	u32 ndomains, nlongs;
1767 	size_t size;
1768 
1769 	ndomains = cap_ndoms(iommu->cap);
1770 	pr_debug("%s: Number of Domains supported <%d>\n",
1771 		 iommu->name, ndomains);
1772 	nlongs = BITS_TO_LONGS(ndomains);
1773 
1774 	spin_lock_init(&iommu->lock);
1775 
1776 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1777 	if (!iommu->domain_ids)
1778 		return -ENOMEM;
1779 
1780 	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1781 	iommu->domains = kzalloc(size, GFP_KERNEL);
1782 
1783 	if (iommu->domains) {
1784 		size = 256 * sizeof(struct dmar_domain *);
1785 		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1786 	}
1787 
1788 	if (!iommu->domains || !iommu->domains[0]) {
1789 		pr_err("%s: Allocating domain array failed\n",
1790 		       iommu->name);
1791 		kfree(iommu->domain_ids);
1792 		kfree(iommu->domains);
1793 		iommu->domain_ids = NULL;
1794 		iommu->domains    = NULL;
1795 		return -ENOMEM;
1796 	}
1797 
1798 	/*
1799 	 * If Caching mode is set, then invalid translations are tagged
1800 	 * with domain-id 0, hence we need to pre-allocate it. We also
1801 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1802 	 * make sure it is not used for a real domain.
1803 	 */
1804 	set_bit(0, iommu->domain_ids);
1805 
1806 	/*
1807 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1808 	 * entry for first-level or pass-through translation modes should
1809 	 * be programmed with a domain id different from those used for
1810 	 * second-level or nested translation. We reserve a domain id for
1811 	 * this purpose.
1812 	 */
1813 	if (sm_supported(iommu))
1814 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1815 
1816 	return 0;
1817 }
1818 
1819 static void disable_dmar_iommu(struct intel_iommu *iommu)
1820 {
1821 	struct device_domain_info *info, *tmp;
1822 	unsigned long flags;
1823 
1824 	if (!iommu->domains || !iommu->domain_ids)
1825 		return;
1826 
1827 	spin_lock_irqsave(&device_domain_lock, flags);
1828 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1829 		if (info->iommu != iommu)
1830 			continue;
1831 
1832 		if (!info->dev || !info->domain)
1833 			continue;
1834 
1835 		__dmar_remove_one_dev_info(info);
1836 	}
1837 	spin_unlock_irqrestore(&device_domain_lock, flags);
1838 
1839 	if (iommu->gcmd & DMA_GCMD_TE)
1840 		iommu_disable_translation(iommu);
1841 }
1842 
1843 static void free_dmar_iommu(struct intel_iommu *iommu)
1844 {
1845 	if ((iommu->domains) && (iommu->domain_ids)) {
1846 		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1847 		int i;
1848 
1849 		for (i = 0; i < elems; i++)
1850 			kfree(iommu->domains[i]);
1851 		kfree(iommu->domains);
1852 		kfree(iommu->domain_ids);
1853 		iommu->domains = NULL;
1854 		iommu->domain_ids = NULL;
1855 	}
1856 
1857 	g_iommus[iommu->seq_id] = NULL;
1858 
1859 	/* free context mapping */
1860 	free_context_table(iommu);
1861 
1862 #ifdef CONFIG_INTEL_IOMMU_SVM
1863 	if (pasid_supported(iommu)) {
1864 		if (ecap_prs(iommu->ecap))
1865 			intel_svm_finish_prq(iommu);
1866 	}
1867 	if (vccap_pasid(iommu->vccap))
1868 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1869 
1870 #endif
1871 }
1872 
1873 /*
1874  * Check and return whether first level is used by default for
1875  * DMA translation.
1876  */
1877 static bool first_level_by_default(void)
1878 {
1879 	return scalable_mode_support() && intel_cap_flts_sanity();
1880 }
1881 
1882 static struct dmar_domain *alloc_domain(int flags)
1883 {
1884 	struct dmar_domain *domain;
1885 
1886 	domain = alloc_domain_mem();
1887 	if (!domain)
1888 		return NULL;
1889 
1890 	memset(domain, 0, sizeof(*domain));
1891 	domain->nid = NUMA_NO_NODE;
1892 	domain->flags = flags;
1893 	if (first_level_by_default())
1894 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1895 	domain->has_iotlb_device = false;
1896 	INIT_LIST_HEAD(&domain->devices);
1897 	INIT_LIST_HEAD(&domain->subdevices);
1898 
1899 	return domain;
1900 }
1901 
1902 /* Must be called with iommu->lock */
1903 static int domain_attach_iommu(struct dmar_domain *domain,
1904 			       struct intel_iommu *iommu)
1905 {
1906 	unsigned long ndomains;
1907 	int num;
1908 
1909 	assert_spin_locked(&device_domain_lock);
1910 	assert_spin_locked(&iommu->lock);
1911 
1912 	domain->iommu_refcnt[iommu->seq_id] += 1;
1913 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1914 		ndomains = cap_ndoms(iommu->cap);
1915 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1916 
1917 		if (num >= ndomains) {
1918 			pr_err("%s: No free domain ids\n", iommu->name);
1919 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1920 			return -ENOSPC;
1921 		}
1922 
1923 		set_bit(num, iommu->domain_ids);
1924 		set_iommu_domain(iommu, num, domain);
1925 
1926 		domain->iommu_did[iommu->seq_id] = num;
1927 		domain->nid			 = iommu->node;
1928 
1929 		domain_update_iommu_cap(domain);
1930 	}
1931 
1932 	return 0;
1933 }
1934 
1935 static void domain_detach_iommu(struct dmar_domain *domain,
1936 				struct intel_iommu *iommu)
1937 {
1938 	int num;
1939 
1940 	assert_spin_locked(&device_domain_lock);
1941 	assert_spin_locked(&iommu->lock);
1942 
1943 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1944 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1945 		num = domain->iommu_did[iommu->seq_id];
1946 		clear_bit(num, iommu->domain_ids);
1947 		set_iommu_domain(iommu, num, NULL);
1948 
1949 		domain_update_iommu_cap(domain);
1950 		domain->iommu_did[iommu->seq_id] = 0;
1951 	}
1952 }
1953 
1954 static inline int guestwidth_to_adjustwidth(int gaw)
1955 {
1956 	int agaw;
1957 	int r = (gaw - 12) % 9;
1958 
1959 	if (r == 0)
1960 		agaw = gaw;
1961 	else
1962 		agaw = gaw + 9 - r;
1963 	if (agaw > 64)
1964 		agaw = 64;
1965 	return agaw;
1966 }
1967 
1968 static void domain_exit(struct dmar_domain *domain)
1969 {
1970 
1971 	/* Remove associated devices and clear attached or cached domains */
1972 	domain_remove_dev_info(domain);
1973 
1974 	if (domain->pgd) {
1975 		struct page *freelist;
1976 
1977 		freelist = domain_unmap(domain, 0,
1978 					DOMAIN_MAX_PFN(domain->gaw), NULL);
1979 		dma_free_pagelist(freelist);
1980 	}
1981 
1982 	free_domain_mem(domain);
1983 }
1984 
1985 /*
1986  * Get the PASID directory size for scalable mode context entry.
1987  * Value of X in the PDTS field of a scalable mode context entry
1988  * indicates PASID directory with 2^(X + 7) entries.
1989  */
1990 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1991 {
1992 	int pds, max_pde;
1993 
1994 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1995 	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1996 	if (pds < 7)
1997 		return 0;
1998 
1999 	return pds - 7;
2000 }
2001 
2002 /*
2003  * Set the RID_PASID field of a scalable mode context entry. The
2004  * IOMMU hardware will use the PASID value set in this field for
2005  * DMA translations of DMA requests without PASID.
2006  */
2007 static inline void
2008 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2009 {
2010 	context->hi |= pasid & ((1 << 20) - 1);
2011 }
2012 
2013 /*
2014  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2015  * entry.
2016  */
2017 static inline void context_set_sm_dte(struct context_entry *context)
2018 {
2019 	context->lo |= (1 << 2);
2020 }
2021 
2022 /*
2023  * Set the PRE(Page Request Enable) field of a scalable mode context
2024  * entry.
2025  */
2026 static inline void context_set_sm_pre(struct context_entry *context)
2027 {
2028 	context->lo |= (1 << 4);
2029 }
2030 
2031 /* Convert value to context PASID directory size field coding. */
2032 #define context_pdts(pds)	(((pds) & 0x7) << 9)
2033 
2034 static int domain_context_mapping_one(struct dmar_domain *domain,
2035 				      struct intel_iommu *iommu,
2036 				      struct pasid_table *table,
2037 				      u8 bus, u8 devfn)
2038 {
2039 	u16 did = domain->iommu_did[iommu->seq_id];
2040 	int translation = CONTEXT_TT_MULTI_LEVEL;
2041 	struct device_domain_info *info = NULL;
2042 	struct context_entry *context;
2043 	unsigned long flags;
2044 	int ret;
2045 
2046 	WARN_ON(did == 0);
2047 
2048 	if (hw_pass_through && domain_type_is_si(domain))
2049 		translation = CONTEXT_TT_PASS_THROUGH;
2050 
2051 	pr_debug("Set context mapping for %02x:%02x.%d\n",
2052 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2053 
2054 	BUG_ON(!domain->pgd);
2055 
2056 	spin_lock_irqsave(&device_domain_lock, flags);
2057 	spin_lock(&iommu->lock);
2058 
2059 	ret = -ENOMEM;
2060 	context = iommu_context_addr(iommu, bus, devfn, 1);
2061 	if (!context)
2062 		goto out_unlock;
2063 
2064 	ret = 0;
2065 	if (context_present(context))
2066 		goto out_unlock;
2067 
2068 	/*
2069 	 * For kdump cases, old valid entries may be cached due to the
2070 	 * in-flight DMA and copied pgtable, but there is no unmapping
2071 	 * behaviour for them, thus we need an explicit cache flush for
2072 	 * the newly-mapped device. For kdump, at this point, the device
2073 	 * is supposed to finish reset at its driver probe stage, so no
2074 	 * in-flight DMA will exist, and we don't need to worry anymore
2075 	 * hereafter.
2076 	 */
2077 	if (context_copied(context)) {
2078 		u16 did_old = context_domain_id(context);
2079 
2080 		if (did_old < cap_ndoms(iommu->cap)) {
2081 			iommu->flush.flush_context(iommu, did_old,
2082 						   (((u16)bus) << 8) | devfn,
2083 						   DMA_CCMD_MASK_NOBIT,
2084 						   DMA_CCMD_DEVICE_INVL);
2085 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2086 						 DMA_TLB_DSI_FLUSH);
2087 		}
2088 	}
2089 
2090 	context_clear_entry(context);
2091 
2092 	if (sm_supported(iommu)) {
2093 		unsigned long pds;
2094 
2095 		WARN_ON(!table);
2096 
2097 		/* Setup the PASID DIR pointer: */
2098 		pds = context_get_sm_pds(table);
2099 		context->lo = (u64)virt_to_phys(table->table) |
2100 				context_pdts(pds);
2101 
2102 		/* Setup the RID_PASID field: */
2103 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2104 
2105 		/*
2106 		 * Setup the Device-TLB enable bit and Page request
2107 		 * Enable bit:
2108 		 */
2109 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2110 		if (info && info->ats_supported)
2111 			context_set_sm_dte(context);
2112 		if (info && info->pri_supported)
2113 			context_set_sm_pre(context);
2114 	} else {
2115 		struct dma_pte *pgd = domain->pgd;
2116 		int agaw;
2117 
2118 		context_set_domain_id(context, did);
2119 
2120 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2121 			/*
2122 			 * Skip top levels of page tables for iommu which has
2123 			 * less agaw than default. Unnecessary for PT mode.
2124 			 */
2125 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2126 				ret = -ENOMEM;
2127 				pgd = phys_to_virt(dma_pte_addr(pgd));
2128 				if (!dma_pte_present(pgd))
2129 					goto out_unlock;
2130 			}
2131 
2132 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2133 			if (info && info->ats_supported)
2134 				translation = CONTEXT_TT_DEV_IOTLB;
2135 			else
2136 				translation = CONTEXT_TT_MULTI_LEVEL;
2137 
2138 			context_set_address_root(context, virt_to_phys(pgd));
2139 			context_set_address_width(context, agaw);
2140 		} else {
2141 			/*
2142 			 * In pass through mode, AW must be programmed to
2143 			 * indicate the largest AGAW value supported by
2144 			 * hardware. And ASR is ignored by hardware.
2145 			 */
2146 			context_set_address_width(context, iommu->msagaw);
2147 		}
2148 
2149 		context_set_translation_type(context, translation);
2150 	}
2151 
2152 	context_set_fault_enable(context);
2153 	context_set_present(context);
2154 	if (!ecap_coherent(iommu->ecap))
2155 		clflush_cache_range(context, sizeof(*context));
2156 
2157 	/*
2158 	 * It's a non-present to present mapping. If hardware doesn't cache
2159 	 * non-present entry we only need to flush the write-buffer. If the
2160 	 * _does_ cache non-present entries, then it does so in the special
2161 	 * domain #0, which we have to flush:
2162 	 */
2163 	if (cap_caching_mode(iommu->cap)) {
2164 		iommu->flush.flush_context(iommu, 0,
2165 					   (((u16)bus) << 8) | devfn,
2166 					   DMA_CCMD_MASK_NOBIT,
2167 					   DMA_CCMD_DEVICE_INVL);
2168 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2169 	} else {
2170 		iommu_flush_write_buffer(iommu);
2171 	}
2172 	iommu_enable_dev_iotlb(info);
2173 
2174 	ret = 0;
2175 
2176 out_unlock:
2177 	spin_unlock(&iommu->lock);
2178 	spin_unlock_irqrestore(&device_domain_lock, flags);
2179 
2180 	return ret;
2181 }
2182 
2183 struct domain_context_mapping_data {
2184 	struct dmar_domain *domain;
2185 	struct intel_iommu *iommu;
2186 	struct pasid_table *table;
2187 };
2188 
2189 static int domain_context_mapping_cb(struct pci_dev *pdev,
2190 				     u16 alias, void *opaque)
2191 {
2192 	struct domain_context_mapping_data *data = opaque;
2193 
2194 	return domain_context_mapping_one(data->domain, data->iommu,
2195 					  data->table, PCI_BUS_NUM(alias),
2196 					  alias & 0xff);
2197 }
2198 
2199 static int
2200 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2201 {
2202 	struct domain_context_mapping_data data;
2203 	struct pasid_table *table;
2204 	struct intel_iommu *iommu;
2205 	u8 bus, devfn;
2206 
2207 	iommu = device_to_iommu(dev, &bus, &devfn);
2208 	if (!iommu)
2209 		return -ENODEV;
2210 
2211 	table = intel_pasid_get_table(dev);
2212 
2213 	if (!dev_is_pci(dev))
2214 		return domain_context_mapping_one(domain, iommu, table,
2215 						  bus, devfn);
2216 
2217 	data.domain = domain;
2218 	data.iommu = iommu;
2219 	data.table = table;
2220 
2221 	return pci_for_each_dma_alias(to_pci_dev(dev),
2222 				      &domain_context_mapping_cb, &data);
2223 }
2224 
2225 static int domain_context_mapped_cb(struct pci_dev *pdev,
2226 				    u16 alias, void *opaque)
2227 {
2228 	struct intel_iommu *iommu = opaque;
2229 
2230 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2231 }
2232 
2233 static int domain_context_mapped(struct device *dev)
2234 {
2235 	struct intel_iommu *iommu;
2236 	u8 bus, devfn;
2237 
2238 	iommu = device_to_iommu(dev, &bus, &devfn);
2239 	if (!iommu)
2240 		return -ENODEV;
2241 
2242 	if (!dev_is_pci(dev))
2243 		return device_context_mapped(iommu, bus, devfn);
2244 
2245 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2246 				       domain_context_mapped_cb, iommu);
2247 }
2248 
2249 /* Returns a number of VTD pages, but aligned to MM page size */
2250 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2251 					    size_t size)
2252 {
2253 	host_addr &= ~PAGE_MASK;
2254 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2255 }
2256 
2257 /* Return largest possible superpage level for a given mapping */
2258 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2259 					  unsigned long iov_pfn,
2260 					  unsigned long phy_pfn,
2261 					  unsigned long pages)
2262 {
2263 	int support, level = 1;
2264 	unsigned long pfnmerge;
2265 
2266 	support = domain->iommu_superpage;
2267 
2268 	/* To use a large page, the virtual *and* physical addresses
2269 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2270 	   of them will mean we have to use smaller pages. So just
2271 	   merge them and check both at once. */
2272 	pfnmerge = iov_pfn | phy_pfn;
2273 
2274 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2275 		pages >>= VTD_STRIDE_SHIFT;
2276 		if (!pages)
2277 			break;
2278 		pfnmerge >>= VTD_STRIDE_SHIFT;
2279 		level++;
2280 		support--;
2281 	}
2282 	return level;
2283 }
2284 
2285 /*
2286  * Ensure that old small page tables are removed to make room for superpage(s).
2287  * We're going to add new large pages, so make sure we don't remove their parent
2288  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2289  */
2290 static void switch_to_super_page(struct dmar_domain *domain,
2291 				 unsigned long start_pfn,
2292 				 unsigned long end_pfn, int level)
2293 {
2294 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2295 	struct dma_pte *pte = NULL;
2296 	int i;
2297 
2298 	while (start_pfn <= end_pfn) {
2299 		if (!pte)
2300 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2301 
2302 		if (dma_pte_present(pte)) {
2303 			dma_pte_free_pagetable(domain, start_pfn,
2304 					       start_pfn + lvl_pages - 1,
2305 					       level + 1);
2306 
2307 			for_each_domain_iommu(i, domain)
2308 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2309 						      start_pfn, lvl_pages,
2310 						      0, 0);
2311 		}
2312 
2313 		pte++;
2314 		start_pfn += lvl_pages;
2315 		if (first_pte_in_page(pte))
2316 			pte = NULL;
2317 	}
2318 }
2319 
2320 static int
2321 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2323 {
2324 	struct dma_pte *first_pte = NULL, *pte = NULL;
2325 	unsigned int largepage_lvl = 0;
2326 	unsigned long lvl_pages = 0;
2327 	phys_addr_t pteval;
2328 	u64 attr;
2329 
2330 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2331 
2332 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2333 		return -EINVAL;
2334 
2335 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2336 	attr |= DMA_FL_PTE_PRESENT;
2337 	if (domain_use_first_level(domain)) {
2338 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2339 		if (prot & DMA_PTE_WRITE)
2340 			attr |= DMA_FL_PTE_DIRTY;
2341 	}
2342 
2343 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344 
2345 	while (nr_pages > 0) {
2346 		uint64_t tmp;
2347 
2348 		if (!pte) {
2349 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2350 					phys_pfn, nr_pages);
2351 
2352 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353 			if (!pte)
2354 				return -ENOMEM;
2355 			first_pte = pte;
2356 
2357 			/* It is large page*/
2358 			if (largepage_lvl > 1) {
2359 				unsigned long end_pfn;
2360 
2361 				pteval |= DMA_PTE_LARGE_PAGE;
2362 				end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2363 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2364 			} else {
2365 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2366 			}
2367 
2368 		}
2369 		/* We don't need lock here, nobody else
2370 		 * touches the iova range
2371 		 */
2372 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2373 		if (tmp) {
2374 			static int dumps = 5;
2375 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2376 				iov_pfn, tmp, (unsigned long long)pteval);
2377 			if (dumps) {
2378 				dumps--;
2379 				debug_dma_dump_mappings(NULL);
2380 			}
2381 			WARN_ON(1);
2382 		}
2383 
2384 		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2385 
2386 		BUG_ON(nr_pages < lvl_pages);
2387 
2388 		nr_pages -= lvl_pages;
2389 		iov_pfn += lvl_pages;
2390 		phys_pfn += lvl_pages;
2391 		pteval += lvl_pages * VTD_PAGE_SIZE;
2392 
2393 		/* If the next PTE would be the first in a new page, then we
2394 		 * need to flush the cache on the entries we've just written.
2395 		 * And then we'll need to recalculate 'pte', so clear it and
2396 		 * let it get set again in the if (!pte) block above.
2397 		 *
2398 		 * If we're done (!nr_pages) we need to flush the cache too.
2399 		 *
2400 		 * Also if we've been setting superpages, we may need to
2401 		 * recalculate 'pte' and switch back to smaller pages for the
2402 		 * end of the mapping, if the trailing size is not enough to
2403 		 * use another superpage (i.e. nr_pages < lvl_pages).
2404 		 */
2405 		pte++;
2406 		if (!nr_pages || first_pte_in_page(pte) ||
2407 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2408 			domain_flush_cache(domain, first_pte,
2409 					   (void *)pte - (void *)first_pte);
2410 			pte = NULL;
2411 		}
2412 	}
2413 
2414 	return 0;
2415 }
2416 
2417 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2418 {
2419 	struct intel_iommu *iommu = info->iommu;
2420 	struct context_entry *context;
2421 	unsigned long flags;
2422 	u16 did_old;
2423 
2424 	if (!iommu)
2425 		return;
2426 
2427 	spin_lock_irqsave(&iommu->lock, flags);
2428 	context = iommu_context_addr(iommu, bus, devfn, 0);
2429 	if (!context) {
2430 		spin_unlock_irqrestore(&iommu->lock, flags);
2431 		return;
2432 	}
2433 
2434 	if (sm_supported(iommu)) {
2435 		if (hw_pass_through && domain_type_is_si(info->domain))
2436 			did_old = FLPT_DEFAULT_DID;
2437 		else
2438 			did_old = info->domain->iommu_did[iommu->seq_id];
2439 	} else {
2440 		did_old = context_domain_id(context);
2441 	}
2442 
2443 	context_clear_entry(context);
2444 	__iommu_flush_cache(iommu, context, sizeof(*context));
2445 	spin_unlock_irqrestore(&iommu->lock, flags);
2446 	iommu->flush.flush_context(iommu,
2447 				   did_old,
2448 				   (((u16)bus) << 8) | devfn,
2449 				   DMA_CCMD_MASK_NOBIT,
2450 				   DMA_CCMD_DEVICE_INVL);
2451 
2452 	if (sm_supported(iommu))
2453 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2454 
2455 	iommu->flush.flush_iotlb(iommu,
2456 				 did_old,
2457 				 0,
2458 				 0,
2459 				 DMA_TLB_DSI_FLUSH);
2460 
2461 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2462 }
2463 
2464 static inline void unlink_domain_info(struct device_domain_info *info)
2465 {
2466 	assert_spin_locked(&device_domain_lock);
2467 	list_del(&info->link);
2468 	list_del(&info->global);
2469 	if (info->dev)
2470 		dev_iommu_priv_set(info->dev, NULL);
2471 }
2472 
2473 static void domain_remove_dev_info(struct dmar_domain *domain)
2474 {
2475 	struct device_domain_info *info, *tmp;
2476 	unsigned long flags;
2477 
2478 	spin_lock_irqsave(&device_domain_lock, flags);
2479 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2480 		__dmar_remove_one_dev_info(info);
2481 	spin_unlock_irqrestore(&device_domain_lock, flags);
2482 }
2483 
2484 struct dmar_domain *find_domain(struct device *dev)
2485 {
2486 	struct device_domain_info *info;
2487 
2488 	if (unlikely(!dev || !dev->iommu))
2489 		return NULL;
2490 
2491 	if (unlikely(attach_deferred(dev)))
2492 		return NULL;
2493 
2494 	/* No lock here, assumes no domain exit in normal case */
2495 	info = get_domain_info(dev);
2496 	if (likely(info))
2497 		return info->domain;
2498 
2499 	return NULL;
2500 }
2501 
2502 static inline struct device_domain_info *
2503 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2504 {
2505 	struct device_domain_info *info;
2506 
2507 	list_for_each_entry(info, &device_domain_list, global)
2508 		if (info->segment == segment && info->bus == bus &&
2509 		    info->devfn == devfn)
2510 			return info;
2511 
2512 	return NULL;
2513 }
2514 
2515 static int domain_setup_first_level(struct intel_iommu *iommu,
2516 				    struct dmar_domain *domain,
2517 				    struct device *dev,
2518 				    u32 pasid)
2519 {
2520 	struct dma_pte *pgd = domain->pgd;
2521 	int agaw, level;
2522 	int flags = 0;
2523 
2524 	/*
2525 	 * Skip top levels of page tables for iommu which has
2526 	 * less agaw than default. Unnecessary for PT mode.
2527 	 */
2528 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2529 		pgd = phys_to_virt(dma_pte_addr(pgd));
2530 		if (!dma_pte_present(pgd))
2531 			return -ENOMEM;
2532 	}
2533 
2534 	level = agaw_to_level(agaw);
2535 	if (level != 4 && level != 5)
2536 		return -EINVAL;
2537 
2538 	if (pasid != PASID_RID2PASID)
2539 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2540 	if (level == 5)
2541 		flags |= PASID_FLAG_FL5LP;
2542 
2543 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2544 		flags |= PASID_FLAG_PAGE_SNOOP;
2545 
2546 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2547 					     domain->iommu_did[iommu->seq_id],
2548 					     flags);
2549 }
2550 
2551 static bool dev_is_real_dma_subdevice(struct device *dev)
2552 {
2553 	return dev && dev_is_pci(dev) &&
2554 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2555 }
2556 
2557 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2558 						    int bus, int devfn,
2559 						    struct device *dev,
2560 						    struct dmar_domain *domain)
2561 {
2562 	struct dmar_domain *found = NULL;
2563 	struct device_domain_info *info;
2564 	unsigned long flags;
2565 	int ret;
2566 
2567 	info = alloc_devinfo_mem();
2568 	if (!info)
2569 		return NULL;
2570 
2571 	if (!dev_is_real_dma_subdevice(dev)) {
2572 		info->bus = bus;
2573 		info->devfn = devfn;
2574 		info->segment = iommu->segment;
2575 	} else {
2576 		struct pci_dev *pdev = to_pci_dev(dev);
2577 
2578 		info->bus = pdev->bus->number;
2579 		info->devfn = pdev->devfn;
2580 		info->segment = pci_domain_nr(pdev->bus);
2581 	}
2582 
2583 	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2584 	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2585 	info->ats_qdep = 0;
2586 	info->dev = dev;
2587 	info->domain = domain;
2588 	info->iommu = iommu;
2589 	info->pasid_table = NULL;
2590 	info->auxd_enabled = 0;
2591 	INIT_LIST_HEAD(&info->subdevices);
2592 
2593 	if (dev && dev_is_pci(dev)) {
2594 		struct pci_dev *pdev = to_pci_dev(info->dev);
2595 
2596 		if (ecap_dev_iotlb_support(iommu->ecap) &&
2597 		    pci_ats_supported(pdev) &&
2598 		    dmar_find_matched_atsr_unit(pdev))
2599 			info->ats_supported = 1;
2600 
2601 		if (sm_supported(iommu)) {
2602 			if (pasid_supported(iommu)) {
2603 				int features = pci_pasid_features(pdev);
2604 				if (features >= 0)
2605 					info->pasid_supported = features | 1;
2606 			}
2607 
2608 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2609 			    pci_pri_supported(pdev))
2610 				info->pri_supported = 1;
2611 		}
2612 	}
2613 
2614 	spin_lock_irqsave(&device_domain_lock, flags);
2615 	if (dev)
2616 		found = find_domain(dev);
2617 
2618 	if (!found) {
2619 		struct device_domain_info *info2;
2620 		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2621 						       info->devfn);
2622 		if (info2) {
2623 			found      = info2->domain;
2624 			info2->dev = dev;
2625 		}
2626 	}
2627 
2628 	if (found) {
2629 		spin_unlock_irqrestore(&device_domain_lock, flags);
2630 		free_devinfo_mem(info);
2631 		/* Caller must free the original domain */
2632 		return found;
2633 	}
2634 
2635 	spin_lock(&iommu->lock);
2636 	ret = domain_attach_iommu(domain, iommu);
2637 	spin_unlock(&iommu->lock);
2638 
2639 	if (ret) {
2640 		spin_unlock_irqrestore(&device_domain_lock, flags);
2641 		free_devinfo_mem(info);
2642 		return NULL;
2643 	}
2644 
2645 	list_add(&info->link, &domain->devices);
2646 	list_add(&info->global, &device_domain_list);
2647 	if (dev)
2648 		dev_iommu_priv_set(dev, info);
2649 	spin_unlock_irqrestore(&device_domain_lock, flags);
2650 
2651 	/* PASID table is mandatory for a PCI device in scalable mode. */
2652 	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2653 		ret = intel_pasid_alloc_table(dev);
2654 		if (ret) {
2655 			dev_err(dev, "PASID table allocation failed\n");
2656 			dmar_remove_one_dev_info(dev);
2657 			return NULL;
2658 		}
2659 
2660 		/* Setup the PASID entry for requests without PASID: */
2661 		spin_lock_irqsave(&iommu->lock, flags);
2662 		if (hw_pass_through && domain_type_is_si(domain))
2663 			ret = intel_pasid_setup_pass_through(iommu, domain,
2664 					dev, PASID_RID2PASID);
2665 		else if (domain_use_first_level(domain))
2666 			ret = domain_setup_first_level(iommu, domain, dev,
2667 					PASID_RID2PASID);
2668 		else
2669 			ret = intel_pasid_setup_second_level(iommu, domain,
2670 					dev, PASID_RID2PASID);
2671 		spin_unlock_irqrestore(&iommu->lock, flags);
2672 		if (ret) {
2673 			dev_err(dev, "Setup RID2PASID failed\n");
2674 			dmar_remove_one_dev_info(dev);
2675 			return NULL;
2676 		}
2677 	}
2678 
2679 	if (dev && domain_context_mapping(domain, dev)) {
2680 		dev_err(dev, "Domain context map failed\n");
2681 		dmar_remove_one_dev_info(dev);
2682 		return NULL;
2683 	}
2684 
2685 	return domain;
2686 }
2687 
2688 static int iommu_domain_identity_map(struct dmar_domain *domain,
2689 				     unsigned long first_vpfn,
2690 				     unsigned long last_vpfn)
2691 {
2692 	/*
2693 	 * RMRR range might have overlap with physical memory range,
2694 	 * clear it first
2695 	 */
2696 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2697 
2698 	return __domain_mapping(domain, first_vpfn,
2699 				first_vpfn, last_vpfn - first_vpfn + 1,
2700 				DMA_PTE_READ|DMA_PTE_WRITE);
2701 }
2702 
2703 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2704 
2705 static int __init si_domain_init(int hw)
2706 {
2707 	struct dmar_rmrr_unit *rmrr;
2708 	struct device *dev;
2709 	int i, nid, ret;
2710 
2711 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2712 	if (!si_domain)
2713 		return -EFAULT;
2714 
2715 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2716 		domain_exit(si_domain);
2717 		return -EFAULT;
2718 	}
2719 
2720 	if (hw)
2721 		return 0;
2722 
2723 	for_each_online_node(nid) {
2724 		unsigned long start_pfn, end_pfn;
2725 		int i;
2726 
2727 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2728 			ret = iommu_domain_identity_map(si_domain,
2729 					mm_to_dma_pfn(start_pfn),
2730 					mm_to_dma_pfn(end_pfn));
2731 			if (ret)
2732 				return ret;
2733 		}
2734 	}
2735 
2736 	/*
2737 	 * Identity map the RMRRs so that devices with RMRRs could also use
2738 	 * the si_domain.
2739 	 */
2740 	for_each_rmrr_units(rmrr) {
2741 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2742 					  i, dev) {
2743 			unsigned long long start = rmrr->base_address;
2744 			unsigned long long end = rmrr->end_address;
2745 
2746 			if (WARN_ON(end < start ||
2747 				    end >> agaw_to_width(si_domain->agaw)))
2748 				continue;
2749 
2750 			ret = iommu_domain_identity_map(si_domain,
2751 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2752 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2753 			if (ret)
2754 				return ret;
2755 		}
2756 	}
2757 
2758 	return 0;
2759 }
2760 
2761 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2762 {
2763 	struct dmar_domain *ndomain;
2764 	struct intel_iommu *iommu;
2765 	u8 bus, devfn;
2766 
2767 	iommu = device_to_iommu(dev, &bus, &devfn);
2768 	if (!iommu)
2769 		return -ENODEV;
2770 
2771 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2772 	if (ndomain != domain)
2773 		return -EBUSY;
2774 
2775 	return 0;
2776 }
2777 
2778 static bool device_has_rmrr(struct device *dev)
2779 {
2780 	struct dmar_rmrr_unit *rmrr;
2781 	struct device *tmp;
2782 	int i;
2783 
2784 	rcu_read_lock();
2785 	for_each_rmrr_units(rmrr) {
2786 		/*
2787 		 * Return TRUE if this RMRR contains the device that
2788 		 * is passed in.
2789 		 */
2790 		for_each_active_dev_scope(rmrr->devices,
2791 					  rmrr->devices_cnt, i, tmp)
2792 			if (tmp == dev ||
2793 			    is_downstream_to_pci_bridge(dev, tmp)) {
2794 				rcu_read_unlock();
2795 				return true;
2796 			}
2797 	}
2798 	rcu_read_unlock();
2799 	return false;
2800 }
2801 
2802 /**
2803  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2804  * is relaxable (ie. is allowed to be not enforced under some conditions)
2805  * @dev: device handle
2806  *
2807  * We assume that PCI USB devices with RMRRs have them largely
2808  * for historical reasons and that the RMRR space is not actively used post
2809  * boot.  This exclusion may change if vendors begin to abuse it.
2810  *
2811  * The same exception is made for graphics devices, with the requirement that
2812  * any use of the RMRR regions will be torn down before assigning the device
2813  * to a guest.
2814  *
2815  * Return: true if the RMRR is relaxable, false otherwise
2816  */
2817 static bool device_rmrr_is_relaxable(struct device *dev)
2818 {
2819 	struct pci_dev *pdev;
2820 
2821 	if (!dev_is_pci(dev))
2822 		return false;
2823 
2824 	pdev = to_pci_dev(dev);
2825 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2826 		return true;
2827 	else
2828 		return false;
2829 }
2830 
2831 /*
2832  * There are a couple cases where we need to restrict the functionality of
2833  * devices associated with RMRRs.  The first is when evaluating a device for
2834  * identity mapping because problems exist when devices are moved in and out
2835  * of domains and their respective RMRR information is lost.  This means that
2836  * a device with associated RMRRs will never be in a "passthrough" domain.
2837  * The second is use of the device through the IOMMU API.  This interface
2838  * expects to have full control of the IOVA space for the device.  We cannot
2839  * satisfy both the requirement that RMRR access is maintained and have an
2840  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2841  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2842  * We therefore prevent devices associated with an RMRR from participating in
2843  * the IOMMU API, which eliminates them from device assignment.
2844  *
2845  * In both cases, devices which have relaxable RMRRs are not concerned by this
2846  * restriction. See device_rmrr_is_relaxable comment.
2847  */
2848 static bool device_is_rmrr_locked(struct device *dev)
2849 {
2850 	if (!device_has_rmrr(dev))
2851 		return false;
2852 
2853 	if (device_rmrr_is_relaxable(dev))
2854 		return false;
2855 
2856 	return true;
2857 }
2858 
2859 /*
2860  * Return the required default domain type for a specific device.
2861  *
2862  * @dev: the device in query
2863  * @startup: true if this is during early boot
2864  *
2865  * Returns:
2866  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2867  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2868  *  - 0: both identity and dynamic domains work for this device
2869  */
2870 static int device_def_domain_type(struct device *dev)
2871 {
2872 	if (dev_is_pci(dev)) {
2873 		struct pci_dev *pdev = to_pci_dev(dev);
2874 
2875 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2876 			return IOMMU_DOMAIN_IDENTITY;
2877 
2878 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2879 			return IOMMU_DOMAIN_IDENTITY;
2880 	}
2881 
2882 	return 0;
2883 }
2884 
2885 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2886 {
2887 	/*
2888 	 * Start from the sane iommu hardware state.
2889 	 * If the queued invalidation is already initialized by us
2890 	 * (for example, while enabling interrupt-remapping) then
2891 	 * we got the things already rolling from a sane state.
2892 	 */
2893 	if (!iommu->qi) {
2894 		/*
2895 		 * Clear any previous faults.
2896 		 */
2897 		dmar_fault(-1, iommu);
2898 		/*
2899 		 * Disable queued invalidation if supported and already enabled
2900 		 * before OS handover.
2901 		 */
2902 		dmar_disable_qi(iommu);
2903 	}
2904 
2905 	if (dmar_enable_qi(iommu)) {
2906 		/*
2907 		 * Queued Invalidate not enabled, use Register Based Invalidate
2908 		 */
2909 		iommu->flush.flush_context = __iommu_flush_context;
2910 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2911 		pr_info("%s: Using Register based invalidation\n",
2912 			iommu->name);
2913 	} else {
2914 		iommu->flush.flush_context = qi_flush_context;
2915 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2916 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2917 	}
2918 }
2919 
2920 static int copy_context_table(struct intel_iommu *iommu,
2921 			      struct root_entry *old_re,
2922 			      struct context_entry **tbl,
2923 			      int bus, bool ext)
2924 {
2925 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2926 	struct context_entry *new_ce = NULL, ce;
2927 	struct context_entry *old_ce = NULL;
2928 	struct root_entry re;
2929 	phys_addr_t old_ce_phys;
2930 
2931 	tbl_idx = ext ? bus * 2 : bus;
2932 	memcpy(&re, old_re, sizeof(re));
2933 
2934 	for (devfn = 0; devfn < 256; devfn++) {
2935 		/* First calculate the correct index */
2936 		idx = (ext ? devfn * 2 : devfn) % 256;
2937 
2938 		if (idx == 0) {
2939 			/* First save what we may have and clean up */
2940 			if (new_ce) {
2941 				tbl[tbl_idx] = new_ce;
2942 				__iommu_flush_cache(iommu, new_ce,
2943 						    VTD_PAGE_SIZE);
2944 				pos = 1;
2945 			}
2946 
2947 			if (old_ce)
2948 				memunmap(old_ce);
2949 
2950 			ret = 0;
2951 			if (devfn < 0x80)
2952 				old_ce_phys = root_entry_lctp(&re);
2953 			else
2954 				old_ce_phys = root_entry_uctp(&re);
2955 
2956 			if (!old_ce_phys) {
2957 				if (ext && devfn == 0) {
2958 					/* No LCTP, try UCTP */
2959 					devfn = 0x7f;
2960 					continue;
2961 				} else {
2962 					goto out;
2963 				}
2964 			}
2965 
2966 			ret = -ENOMEM;
2967 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2968 					MEMREMAP_WB);
2969 			if (!old_ce)
2970 				goto out;
2971 
2972 			new_ce = alloc_pgtable_page(iommu->node);
2973 			if (!new_ce)
2974 				goto out_unmap;
2975 
2976 			ret = 0;
2977 		}
2978 
2979 		/* Now copy the context entry */
2980 		memcpy(&ce, old_ce + idx, sizeof(ce));
2981 
2982 		if (!__context_present(&ce))
2983 			continue;
2984 
2985 		did = context_domain_id(&ce);
2986 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2987 			set_bit(did, iommu->domain_ids);
2988 
2989 		/*
2990 		 * We need a marker for copied context entries. This
2991 		 * marker needs to work for the old format as well as
2992 		 * for extended context entries.
2993 		 *
2994 		 * Bit 67 of the context entry is used. In the old
2995 		 * format this bit is available to software, in the
2996 		 * extended format it is the PGE bit, but PGE is ignored
2997 		 * by HW if PASIDs are disabled (and thus still
2998 		 * available).
2999 		 *
3000 		 * So disable PASIDs first and then mark the entry
3001 		 * copied. This means that we don't copy PASID
3002 		 * translations from the old kernel, but this is fine as
3003 		 * faults there are not fatal.
3004 		 */
3005 		context_clear_pasid_enable(&ce);
3006 		context_set_copied(&ce);
3007 
3008 		new_ce[idx] = ce;
3009 	}
3010 
3011 	tbl[tbl_idx + pos] = new_ce;
3012 
3013 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3014 
3015 out_unmap:
3016 	memunmap(old_ce);
3017 
3018 out:
3019 	return ret;
3020 }
3021 
3022 static int copy_translation_tables(struct intel_iommu *iommu)
3023 {
3024 	struct context_entry **ctxt_tbls;
3025 	struct root_entry *old_rt;
3026 	phys_addr_t old_rt_phys;
3027 	int ctxt_table_entries;
3028 	unsigned long flags;
3029 	u64 rtaddr_reg;
3030 	int bus, ret;
3031 	bool new_ext, ext;
3032 
3033 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3034 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3035 	new_ext    = !!ecap_ecs(iommu->ecap);
3036 
3037 	/*
3038 	 * The RTT bit can only be changed when translation is disabled,
3039 	 * but disabling translation means to open a window for data
3040 	 * corruption. So bail out and don't copy anything if we would
3041 	 * have to change the bit.
3042 	 */
3043 	if (new_ext != ext)
3044 		return -EINVAL;
3045 
3046 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3047 	if (!old_rt_phys)
3048 		return -EINVAL;
3049 
3050 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3051 	if (!old_rt)
3052 		return -ENOMEM;
3053 
3054 	/* This is too big for the stack - allocate it from slab */
3055 	ctxt_table_entries = ext ? 512 : 256;
3056 	ret = -ENOMEM;
3057 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3058 	if (!ctxt_tbls)
3059 		goto out_unmap;
3060 
3061 	for (bus = 0; bus < 256; bus++) {
3062 		ret = copy_context_table(iommu, &old_rt[bus],
3063 					 ctxt_tbls, bus, ext);
3064 		if (ret) {
3065 			pr_err("%s: Failed to copy context table for bus %d\n",
3066 				iommu->name, bus);
3067 			continue;
3068 		}
3069 	}
3070 
3071 	spin_lock_irqsave(&iommu->lock, flags);
3072 
3073 	/* Context tables are copied, now write them to the root_entry table */
3074 	for (bus = 0; bus < 256; bus++) {
3075 		int idx = ext ? bus * 2 : bus;
3076 		u64 val;
3077 
3078 		if (ctxt_tbls[idx]) {
3079 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3080 			iommu->root_entry[bus].lo = val;
3081 		}
3082 
3083 		if (!ext || !ctxt_tbls[idx + 1])
3084 			continue;
3085 
3086 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3087 		iommu->root_entry[bus].hi = val;
3088 	}
3089 
3090 	spin_unlock_irqrestore(&iommu->lock, flags);
3091 
3092 	kfree(ctxt_tbls);
3093 
3094 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3095 
3096 	ret = 0;
3097 
3098 out_unmap:
3099 	memunmap(old_rt);
3100 
3101 	return ret;
3102 }
3103 
3104 #ifdef CONFIG_INTEL_IOMMU_SVM
3105 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3106 {
3107 	struct intel_iommu *iommu = data;
3108 	ioasid_t ioasid;
3109 
3110 	if (!iommu)
3111 		return INVALID_IOASID;
3112 	/*
3113 	 * VT-d virtual command interface always uses the full 20 bit
3114 	 * PASID range. Host can partition guest PASID range based on
3115 	 * policies but it is out of guest's control.
3116 	 */
3117 	if (min < PASID_MIN || max > intel_pasid_max_id)
3118 		return INVALID_IOASID;
3119 
3120 	if (vcmd_alloc_pasid(iommu, &ioasid))
3121 		return INVALID_IOASID;
3122 
3123 	return ioasid;
3124 }
3125 
3126 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3127 {
3128 	struct intel_iommu *iommu = data;
3129 
3130 	if (!iommu)
3131 		return;
3132 	/*
3133 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3134 	 * We can only free the PASID when all the devices are unbound.
3135 	 */
3136 	if (ioasid_find(NULL, ioasid, NULL)) {
3137 		pr_alert("Cannot free active IOASID %d\n", ioasid);
3138 		return;
3139 	}
3140 	vcmd_free_pasid(iommu, ioasid);
3141 }
3142 
3143 static void register_pasid_allocator(struct intel_iommu *iommu)
3144 {
3145 	/*
3146 	 * If we are running in the host, no need for custom allocator
3147 	 * in that PASIDs are allocated from the host system-wide.
3148 	 */
3149 	if (!cap_caching_mode(iommu->cap))
3150 		return;
3151 
3152 	if (!sm_supported(iommu)) {
3153 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3154 		return;
3155 	}
3156 
3157 	/*
3158 	 * Register a custom PASID allocator if we are running in a guest,
3159 	 * guest PASID must be obtained via virtual command interface.
3160 	 * There can be multiple vIOMMUs in each guest but only one allocator
3161 	 * is active. All vIOMMU allocators will eventually be calling the same
3162 	 * host allocator.
3163 	 */
3164 	if (!vccap_pasid(iommu->vccap))
3165 		return;
3166 
3167 	pr_info("Register custom PASID allocator\n");
3168 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3169 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3170 	iommu->pasid_allocator.pdata = (void *)iommu;
3171 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3172 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3173 		/*
3174 		 * Disable scalable mode on this IOMMU if there
3175 		 * is no custom allocator. Mixing SM capable vIOMMU
3176 		 * and non-SM vIOMMU are not supported.
3177 		 */
3178 		intel_iommu_sm = 0;
3179 	}
3180 }
3181 #endif
3182 
3183 static int __init init_dmars(void)
3184 {
3185 	struct dmar_drhd_unit *drhd;
3186 	struct intel_iommu *iommu;
3187 	int ret;
3188 
3189 	/*
3190 	 * for each drhd
3191 	 *    allocate root
3192 	 *    initialize and program root entry to not present
3193 	 * endfor
3194 	 */
3195 	for_each_drhd_unit(drhd) {
3196 		/*
3197 		 * lock not needed as this is only incremented in the single
3198 		 * threaded kernel __init code path all other access are read
3199 		 * only
3200 		 */
3201 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3202 			g_num_of_iommus++;
3203 			continue;
3204 		}
3205 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3206 	}
3207 
3208 	/* Preallocate enough resources for IOMMU hot-addition */
3209 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3210 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3211 
3212 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3213 			GFP_KERNEL);
3214 	if (!g_iommus) {
3215 		ret = -ENOMEM;
3216 		goto error;
3217 	}
3218 
3219 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3220 	if (ret)
3221 		goto free_iommu;
3222 
3223 	for_each_iommu(iommu, drhd) {
3224 		if (drhd->ignored) {
3225 			iommu_disable_translation(iommu);
3226 			continue;
3227 		}
3228 
3229 		/*
3230 		 * Find the max pasid size of all IOMMU's in the system.
3231 		 * We need to ensure the system pasid table is no bigger
3232 		 * than the smallest supported.
3233 		 */
3234 		if (pasid_supported(iommu)) {
3235 			u32 temp = 2 << ecap_pss(iommu->ecap);
3236 
3237 			intel_pasid_max_id = min_t(u32, temp,
3238 						   intel_pasid_max_id);
3239 		}
3240 
3241 		g_iommus[iommu->seq_id] = iommu;
3242 
3243 		intel_iommu_init_qi(iommu);
3244 
3245 		ret = iommu_init_domains(iommu);
3246 		if (ret)
3247 			goto free_iommu;
3248 
3249 		init_translation_status(iommu);
3250 
3251 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3252 			iommu_disable_translation(iommu);
3253 			clear_translation_pre_enabled(iommu);
3254 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3255 				iommu->name);
3256 		}
3257 
3258 		/*
3259 		 * TBD:
3260 		 * we could share the same root & context tables
3261 		 * among all IOMMU's. Need to Split it later.
3262 		 */
3263 		ret = iommu_alloc_root_entry(iommu);
3264 		if (ret)
3265 			goto free_iommu;
3266 
3267 		if (translation_pre_enabled(iommu)) {
3268 			pr_info("Translation already enabled - trying to copy translation structures\n");
3269 
3270 			ret = copy_translation_tables(iommu);
3271 			if (ret) {
3272 				/*
3273 				 * We found the IOMMU with translation
3274 				 * enabled - but failed to copy over the
3275 				 * old root-entry table. Try to proceed
3276 				 * by disabling translation now and
3277 				 * allocating a clean root-entry table.
3278 				 * This might cause DMAR faults, but
3279 				 * probably the dump will still succeed.
3280 				 */
3281 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3282 				       iommu->name);
3283 				iommu_disable_translation(iommu);
3284 				clear_translation_pre_enabled(iommu);
3285 			} else {
3286 				pr_info("Copied translation tables from previous kernel for %s\n",
3287 					iommu->name);
3288 			}
3289 		}
3290 
3291 		if (!ecap_pass_through(iommu->ecap))
3292 			hw_pass_through = 0;
3293 		intel_svm_check(iommu);
3294 	}
3295 
3296 	/*
3297 	 * Now that qi is enabled on all iommus, set the root entry and flush
3298 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3299 	 * flush_context function will loop forever and the boot hangs.
3300 	 */
3301 	for_each_active_iommu(iommu, drhd) {
3302 		iommu_flush_write_buffer(iommu);
3303 #ifdef CONFIG_INTEL_IOMMU_SVM
3304 		register_pasid_allocator(iommu);
3305 #endif
3306 		iommu_set_root_entry(iommu);
3307 	}
3308 
3309 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3310 	dmar_map_gfx = 0;
3311 #endif
3312 
3313 	if (!dmar_map_gfx)
3314 		iommu_identity_mapping |= IDENTMAP_GFX;
3315 
3316 	check_tylersburg_isoch();
3317 
3318 	ret = si_domain_init(hw_pass_through);
3319 	if (ret)
3320 		goto free_iommu;
3321 
3322 	/*
3323 	 * for each drhd
3324 	 *   enable fault log
3325 	 *   global invalidate context cache
3326 	 *   global invalidate iotlb
3327 	 *   enable translation
3328 	 */
3329 	for_each_iommu(iommu, drhd) {
3330 		if (drhd->ignored) {
3331 			/*
3332 			 * we always have to disable PMRs or DMA may fail on
3333 			 * this device
3334 			 */
3335 			if (force_on)
3336 				iommu_disable_protect_mem_regions(iommu);
3337 			continue;
3338 		}
3339 
3340 		iommu_flush_write_buffer(iommu);
3341 
3342 #ifdef CONFIG_INTEL_IOMMU_SVM
3343 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3344 			/*
3345 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3346 			 * could cause possible lock race condition.
3347 			 */
3348 			up_write(&dmar_global_lock);
3349 			ret = intel_svm_enable_prq(iommu);
3350 			down_write(&dmar_global_lock);
3351 			if (ret)
3352 				goto free_iommu;
3353 		}
3354 #endif
3355 		ret = dmar_set_interrupt(iommu);
3356 		if (ret)
3357 			goto free_iommu;
3358 	}
3359 
3360 	return 0;
3361 
3362 free_iommu:
3363 	for_each_active_iommu(iommu, drhd) {
3364 		disable_dmar_iommu(iommu);
3365 		free_dmar_iommu(iommu);
3366 	}
3367 
3368 	kfree(g_iommus);
3369 
3370 error:
3371 	return ret;
3372 }
3373 
3374 static inline int iommu_domain_cache_init(void)
3375 {
3376 	int ret = 0;
3377 
3378 	iommu_domain_cache = kmem_cache_create("iommu_domain",
3379 					 sizeof(struct dmar_domain),
3380 					 0,
3381 					 SLAB_HWCACHE_ALIGN,
3382 
3383 					 NULL);
3384 	if (!iommu_domain_cache) {
3385 		pr_err("Couldn't create iommu_domain cache\n");
3386 		ret = -ENOMEM;
3387 	}
3388 
3389 	return ret;
3390 }
3391 
3392 static inline int iommu_devinfo_cache_init(void)
3393 {
3394 	int ret = 0;
3395 
3396 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3397 					 sizeof(struct device_domain_info),
3398 					 0,
3399 					 SLAB_HWCACHE_ALIGN,
3400 					 NULL);
3401 	if (!iommu_devinfo_cache) {
3402 		pr_err("Couldn't create devinfo cache\n");
3403 		ret = -ENOMEM;
3404 	}
3405 
3406 	return ret;
3407 }
3408 
3409 static int __init iommu_init_mempool(void)
3410 {
3411 	int ret;
3412 	ret = iova_cache_get();
3413 	if (ret)
3414 		return ret;
3415 
3416 	ret = iommu_domain_cache_init();
3417 	if (ret)
3418 		goto domain_error;
3419 
3420 	ret = iommu_devinfo_cache_init();
3421 	if (!ret)
3422 		return ret;
3423 
3424 	kmem_cache_destroy(iommu_domain_cache);
3425 domain_error:
3426 	iova_cache_put();
3427 
3428 	return -ENOMEM;
3429 }
3430 
3431 static void __init iommu_exit_mempool(void)
3432 {
3433 	kmem_cache_destroy(iommu_devinfo_cache);
3434 	kmem_cache_destroy(iommu_domain_cache);
3435 	iova_cache_put();
3436 }
3437 
3438 static void __init init_no_remapping_devices(void)
3439 {
3440 	struct dmar_drhd_unit *drhd;
3441 	struct device *dev;
3442 	int i;
3443 
3444 	for_each_drhd_unit(drhd) {
3445 		if (!drhd->include_all) {
3446 			for_each_active_dev_scope(drhd->devices,
3447 						  drhd->devices_cnt, i, dev)
3448 				break;
3449 			/* ignore DMAR unit if no devices exist */
3450 			if (i == drhd->devices_cnt)
3451 				drhd->ignored = 1;
3452 		}
3453 	}
3454 
3455 	for_each_active_drhd_unit(drhd) {
3456 		if (drhd->include_all)
3457 			continue;
3458 
3459 		for_each_active_dev_scope(drhd->devices,
3460 					  drhd->devices_cnt, i, dev)
3461 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3462 				break;
3463 		if (i < drhd->devices_cnt)
3464 			continue;
3465 
3466 		/* This IOMMU has *only* gfx devices. Either bypass it or
3467 		   set the gfx_mapped flag, as appropriate */
3468 		drhd->gfx_dedicated = 1;
3469 		if (!dmar_map_gfx)
3470 			drhd->ignored = 1;
3471 	}
3472 }
3473 
3474 #ifdef CONFIG_SUSPEND
3475 static int init_iommu_hw(void)
3476 {
3477 	struct dmar_drhd_unit *drhd;
3478 	struct intel_iommu *iommu = NULL;
3479 
3480 	for_each_active_iommu(iommu, drhd)
3481 		if (iommu->qi)
3482 			dmar_reenable_qi(iommu);
3483 
3484 	for_each_iommu(iommu, drhd) {
3485 		if (drhd->ignored) {
3486 			/*
3487 			 * we always have to disable PMRs or DMA may fail on
3488 			 * this device
3489 			 */
3490 			if (force_on)
3491 				iommu_disable_protect_mem_regions(iommu);
3492 			continue;
3493 		}
3494 
3495 		iommu_flush_write_buffer(iommu);
3496 		iommu_set_root_entry(iommu);
3497 		iommu_enable_translation(iommu);
3498 		iommu_disable_protect_mem_regions(iommu);
3499 	}
3500 
3501 	return 0;
3502 }
3503 
3504 static void iommu_flush_all(void)
3505 {
3506 	struct dmar_drhd_unit *drhd;
3507 	struct intel_iommu *iommu;
3508 
3509 	for_each_active_iommu(iommu, drhd) {
3510 		iommu->flush.flush_context(iommu, 0, 0, 0,
3511 					   DMA_CCMD_GLOBAL_INVL);
3512 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3513 					 DMA_TLB_GLOBAL_FLUSH);
3514 	}
3515 }
3516 
3517 static int iommu_suspend(void)
3518 {
3519 	struct dmar_drhd_unit *drhd;
3520 	struct intel_iommu *iommu = NULL;
3521 	unsigned long flag;
3522 
3523 	for_each_active_iommu(iommu, drhd) {
3524 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3525 					     GFP_KERNEL);
3526 		if (!iommu->iommu_state)
3527 			goto nomem;
3528 	}
3529 
3530 	iommu_flush_all();
3531 
3532 	for_each_active_iommu(iommu, drhd) {
3533 		iommu_disable_translation(iommu);
3534 
3535 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3536 
3537 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3538 			readl(iommu->reg + DMAR_FECTL_REG);
3539 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3540 			readl(iommu->reg + DMAR_FEDATA_REG);
3541 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3542 			readl(iommu->reg + DMAR_FEADDR_REG);
3543 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3544 			readl(iommu->reg + DMAR_FEUADDR_REG);
3545 
3546 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3547 	}
3548 	return 0;
3549 
3550 nomem:
3551 	for_each_active_iommu(iommu, drhd)
3552 		kfree(iommu->iommu_state);
3553 
3554 	return -ENOMEM;
3555 }
3556 
3557 static void iommu_resume(void)
3558 {
3559 	struct dmar_drhd_unit *drhd;
3560 	struct intel_iommu *iommu = NULL;
3561 	unsigned long flag;
3562 
3563 	if (init_iommu_hw()) {
3564 		if (force_on)
3565 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3566 		else
3567 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3568 		return;
3569 	}
3570 
3571 	for_each_active_iommu(iommu, drhd) {
3572 
3573 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3574 
3575 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3576 			iommu->reg + DMAR_FECTL_REG);
3577 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3578 			iommu->reg + DMAR_FEDATA_REG);
3579 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3580 			iommu->reg + DMAR_FEADDR_REG);
3581 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3582 			iommu->reg + DMAR_FEUADDR_REG);
3583 
3584 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3585 	}
3586 
3587 	for_each_active_iommu(iommu, drhd)
3588 		kfree(iommu->iommu_state);
3589 }
3590 
3591 static struct syscore_ops iommu_syscore_ops = {
3592 	.resume		= iommu_resume,
3593 	.suspend	= iommu_suspend,
3594 };
3595 
3596 static void __init init_iommu_pm_ops(void)
3597 {
3598 	register_syscore_ops(&iommu_syscore_ops);
3599 }
3600 
3601 #else
3602 static inline void init_iommu_pm_ops(void) {}
3603 #endif	/* CONFIG_PM */
3604 
3605 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3606 {
3607 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3608 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3609 	    rmrr->end_address <= rmrr->base_address ||
3610 	    arch_rmrr_sanity_check(rmrr))
3611 		return -EINVAL;
3612 
3613 	return 0;
3614 }
3615 
3616 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3617 {
3618 	struct acpi_dmar_reserved_memory *rmrr;
3619 	struct dmar_rmrr_unit *rmrru;
3620 
3621 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3622 	if (rmrr_sanity_check(rmrr)) {
3623 		pr_warn(FW_BUG
3624 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3625 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3626 			   rmrr->base_address, rmrr->end_address,
3627 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3628 			   dmi_get_system_info(DMI_BIOS_VERSION),
3629 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3630 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3631 	}
3632 
3633 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3634 	if (!rmrru)
3635 		goto out;
3636 
3637 	rmrru->hdr = header;
3638 
3639 	rmrru->base_address = rmrr->base_address;
3640 	rmrru->end_address = rmrr->end_address;
3641 
3642 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3643 				((void *)rmrr) + rmrr->header.length,
3644 				&rmrru->devices_cnt);
3645 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3646 		goto free_rmrru;
3647 
3648 	list_add(&rmrru->list, &dmar_rmrr_units);
3649 
3650 	return 0;
3651 free_rmrru:
3652 	kfree(rmrru);
3653 out:
3654 	return -ENOMEM;
3655 }
3656 
3657 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3658 {
3659 	struct dmar_atsr_unit *atsru;
3660 	struct acpi_dmar_atsr *tmp;
3661 
3662 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3663 				dmar_rcu_check()) {
3664 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3665 		if (atsr->segment != tmp->segment)
3666 			continue;
3667 		if (atsr->header.length != tmp->header.length)
3668 			continue;
3669 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3670 			return atsru;
3671 	}
3672 
3673 	return NULL;
3674 }
3675 
3676 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3677 {
3678 	struct acpi_dmar_atsr *atsr;
3679 	struct dmar_atsr_unit *atsru;
3680 
3681 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3682 		return 0;
3683 
3684 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3685 	atsru = dmar_find_atsr(atsr);
3686 	if (atsru)
3687 		return 0;
3688 
3689 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3690 	if (!atsru)
3691 		return -ENOMEM;
3692 
3693 	/*
3694 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3695 	 * copy the memory content because the memory buffer will be freed
3696 	 * on return.
3697 	 */
3698 	atsru->hdr = (void *)(atsru + 1);
3699 	memcpy(atsru->hdr, hdr, hdr->length);
3700 	atsru->include_all = atsr->flags & 0x1;
3701 	if (!atsru->include_all) {
3702 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3703 				(void *)atsr + atsr->header.length,
3704 				&atsru->devices_cnt);
3705 		if (atsru->devices_cnt && atsru->devices == NULL) {
3706 			kfree(atsru);
3707 			return -ENOMEM;
3708 		}
3709 	}
3710 
3711 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3712 
3713 	return 0;
3714 }
3715 
3716 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3717 {
3718 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3719 	kfree(atsru);
3720 }
3721 
3722 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3723 {
3724 	struct acpi_dmar_atsr *atsr;
3725 	struct dmar_atsr_unit *atsru;
3726 
3727 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3728 	atsru = dmar_find_atsr(atsr);
3729 	if (atsru) {
3730 		list_del_rcu(&atsru->list);
3731 		synchronize_rcu();
3732 		intel_iommu_free_atsr(atsru);
3733 	}
3734 
3735 	return 0;
3736 }
3737 
3738 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3739 {
3740 	int i;
3741 	struct device *dev;
3742 	struct acpi_dmar_atsr *atsr;
3743 	struct dmar_atsr_unit *atsru;
3744 
3745 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3746 	atsru = dmar_find_atsr(atsr);
3747 	if (!atsru)
3748 		return 0;
3749 
3750 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3751 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3752 					  i, dev)
3753 			return -EBUSY;
3754 	}
3755 
3756 	return 0;
3757 }
3758 
3759 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3760 {
3761 	struct dmar_satc_unit *satcu;
3762 	struct acpi_dmar_satc *tmp;
3763 
3764 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3765 				dmar_rcu_check()) {
3766 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3767 		if (satc->segment != tmp->segment)
3768 			continue;
3769 		if (satc->header.length != tmp->header.length)
3770 			continue;
3771 		if (memcmp(satc, tmp, satc->header.length) == 0)
3772 			return satcu;
3773 	}
3774 
3775 	return NULL;
3776 }
3777 
3778 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3779 {
3780 	struct acpi_dmar_satc *satc;
3781 	struct dmar_satc_unit *satcu;
3782 
3783 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3784 		return 0;
3785 
3786 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3787 	satcu = dmar_find_satc(satc);
3788 	if (satcu)
3789 		return 0;
3790 
3791 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3792 	if (!satcu)
3793 		return -ENOMEM;
3794 
3795 	satcu->hdr = (void *)(satcu + 1);
3796 	memcpy(satcu->hdr, hdr, hdr->length);
3797 	satcu->atc_required = satc->flags & 0x1;
3798 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3799 					      (void *)satc + satc->header.length,
3800 					      &satcu->devices_cnt);
3801 	if (satcu->devices_cnt && !satcu->devices) {
3802 		kfree(satcu);
3803 		return -ENOMEM;
3804 	}
3805 	list_add_rcu(&satcu->list, &dmar_satc_units);
3806 
3807 	return 0;
3808 }
3809 
3810 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3811 {
3812 	int sp, ret;
3813 	struct intel_iommu *iommu = dmaru->iommu;
3814 
3815 	if (g_iommus[iommu->seq_id])
3816 		return 0;
3817 
3818 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3819 	if (ret)
3820 		goto out;
3821 
3822 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3823 		pr_warn("%s: Doesn't support hardware pass through.\n",
3824 			iommu->name);
3825 		return -ENXIO;
3826 	}
3827 	if (!ecap_sc_support(iommu->ecap) &&
3828 	    domain_update_iommu_snooping(iommu)) {
3829 		pr_warn("%s: Doesn't support snooping.\n",
3830 			iommu->name);
3831 		return -ENXIO;
3832 	}
3833 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3834 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3835 		pr_warn("%s: Doesn't support large page.\n",
3836 			iommu->name);
3837 		return -ENXIO;
3838 	}
3839 
3840 	/*
3841 	 * Disable translation if already enabled prior to OS handover.
3842 	 */
3843 	if (iommu->gcmd & DMA_GCMD_TE)
3844 		iommu_disable_translation(iommu);
3845 
3846 	g_iommus[iommu->seq_id] = iommu;
3847 	ret = iommu_init_domains(iommu);
3848 	if (ret == 0)
3849 		ret = iommu_alloc_root_entry(iommu);
3850 	if (ret)
3851 		goto out;
3852 
3853 	intel_svm_check(iommu);
3854 
3855 	if (dmaru->ignored) {
3856 		/*
3857 		 * we always have to disable PMRs or DMA may fail on this device
3858 		 */
3859 		if (force_on)
3860 			iommu_disable_protect_mem_regions(iommu);
3861 		return 0;
3862 	}
3863 
3864 	intel_iommu_init_qi(iommu);
3865 	iommu_flush_write_buffer(iommu);
3866 
3867 #ifdef CONFIG_INTEL_IOMMU_SVM
3868 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3869 		ret = intel_svm_enable_prq(iommu);
3870 		if (ret)
3871 			goto disable_iommu;
3872 	}
3873 #endif
3874 	ret = dmar_set_interrupt(iommu);
3875 	if (ret)
3876 		goto disable_iommu;
3877 
3878 	iommu_set_root_entry(iommu);
3879 	iommu_enable_translation(iommu);
3880 
3881 	iommu_disable_protect_mem_regions(iommu);
3882 	return 0;
3883 
3884 disable_iommu:
3885 	disable_dmar_iommu(iommu);
3886 out:
3887 	free_dmar_iommu(iommu);
3888 	return ret;
3889 }
3890 
3891 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3892 {
3893 	int ret = 0;
3894 	struct intel_iommu *iommu = dmaru->iommu;
3895 
3896 	if (!intel_iommu_enabled)
3897 		return 0;
3898 	if (iommu == NULL)
3899 		return -EINVAL;
3900 
3901 	if (insert) {
3902 		ret = intel_iommu_add(dmaru);
3903 	} else {
3904 		disable_dmar_iommu(iommu);
3905 		free_dmar_iommu(iommu);
3906 	}
3907 
3908 	return ret;
3909 }
3910 
3911 static void intel_iommu_free_dmars(void)
3912 {
3913 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3914 	struct dmar_atsr_unit *atsru, *atsr_n;
3915 	struct dmar_satc_unit *satcu, *satc_n;
3916 
3917 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3918 		list_del(&rmrru->list);
3919 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3920 		kfree(rmrru);
3921 	}
3922 
3923 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3924 		list_del(&atsru->list);
3925 		intel_iommu_free_atsr(atsru);
3926 	}
3927 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3928 		list_del(&satcu->list);
3929 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3930 		kfree(satcu);
3931 	}
3932 }
3933 
3934 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3935 {
3936 	int i, ret = 1;
3937 	struct pci_bus *bus;
3938 	struct pci_dev *bridge = NULL;
3939 	struct device *tmp;
3940 	struct acpi_dmar_atsr *atsr;
3941 	struct dmar_atsr_unit *atsru;
3942 
3943 	dev = pci_physfn(dev);
3944 	for (bus = dev->bus; bus; bus = bus->parent) {
3945 		bridge = bus->self;
3946 		/* If it's an integrated device, allow ATS */
3947 		if (!bridge)
3948 			return 1;
3949 		/* Connected via non-PCIe: no ATS */
3950 		if (!pci_is_pcie(bridge) ||
3951 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3952 			return 0;
3953 		/* If we found the root port, look it up in the ATSR */
3954 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3955 			break;
3956 	}
3957 
3958 	rcu_read_lock();
3959 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3960 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3961 		if (atsr->segment != pci_domain_nr(dev->bus))
3962 			continue;
3963 
3964 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3965 			if (tmp == &bridge->dev)
3966 				goto out;
3967 
3968 		if (atsru->include_all)
3969 			goto out;
3970 	}
3971 	ret = 0;
3972 out:
3973 	rcu_read_unlock();
3974 
3975 	return ret;
3976 }
3977 
3978 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3979 {
3980 	int ret;
3981 	struct dmar_rmrr_unit *rmrru;
3982 	struct dmar_atsr_unit *atsru;
3983 	struct dmar_satc_unit *satcu;
3984 	struct acpi_dmar_atsr *atsr;
3985 	struct acpi_dmar_reserved_memory *rmrr;
3986 	struct acpi_dmar_satc *satc;
3987 
3988 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3989 		return 0;
3990 
3991 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3992 		rmrr = container_of(rmrru->hdr,
3993 				    struct acpi_dmar_reserved_memory, header);
3994 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3995 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3996 				((void *)rmrr) + rmrr->header.length,
3997 				rmrr->segment, rmrru->devices,
3998 				rmrru->devices_cnt);
3999 			if (ret < 0)
4000 				return ret;
4001 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4002 			dmar_remove_dev_scope(info, rmrr->segment,
4003 				rmrru->devices, rmrru->devices_cnt);
4004 		}
4005 	}
4006 
4007 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4008 		if (atsru->include_all)
4009 			continue;
4010 
4011 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4012 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4013 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4014 					(void *)atsr + atsr->header.length,
4015 					atsr->segment, atsru->devices,
4016 					atsru->devices_cnt);
4017 			if (ret > 0)
4018 				break;
4019 			else if (ret < 0)
4020 				return ret;
4021 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4022 			if (dmar_remove_dev_scope(info, atsr->segment,
4023 					atsru->devices, atsru->devices_cnt))
4024 				break;
4025 		}
4026 	}
4027 	list_for_each_entry(satcu, &dmar_satc_units, list) {
4028 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4029 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4030 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4031 					(void *)satc + satc->header.length,
4032 					satc->segment, satcu->devices,
4033 					satcu->devices_cnt);
4034 			if (ret > 0)
4035 				break;
4036 			else if (ret < 0)
4037 				return ret;
4038 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4039 			if (dmar_remove_dev_scope(info, satc->segment,
4040 					satcu->devices, satcu->devices_cnt))
4041 				break;
4042 		}
4043 	}
4044 
4045 	return 0;
4046 }
4047 
4048 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4049 				       unsigned long val, void *v)
4050 {
4051 	struct memory_notify *mhp = v;
4052 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4053 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4054 			mhp->nr_pages - 1);
4055 
4056 	switch (val) {
4057 	case MEM_GOING_ONLINE:
4058 		if (iommu_domain_identity_map(si_domain,
4059 					      start_vpfn, last_vpfn)) {
4060 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4061 				start_vpfn, last_vpfn);
4062 			return NOTIFY_BAD;
4063 		}
4064 		break;
4065 
4066 	case MEM_OFFLINE:
4067 	case MEM_CANCEL_ONLINE:
4068 		{
4069 			struct dmar_drhd_unit *drhd;
4070 			struct intel_iommu *iommu;
4071 			struct page *freelist;
4072 
4073 			freelist = domain_unmap(si_domain,
4074 						start_vpfn, last_vpfn,
4075 						NULL);
4076 
4077 			rcu_read_lock();
4078 			for_each_active_iommu(iommu, drhd)
4079 				iommu_flush_iotlb_psi(iommu, si_domain,
4080 					start_vpfn, mhp->nr_pages,
4081 					!freelist, 0);
4082 			rcu_read_unlock();
4083 			dma_free_pagelist(freelist);
4084 		}
4085 		break;
4086 	}
4087 
4088 	return NOTIFY_OK;
4089 }
4090 
4091 static struct notifier_block intel_iommu_memory_nb = {
4092 	.notifier_call = intel_iommu_memory_notifier,
4093 	.priority = 0
4094 };
4095 
4096 static void intel_disable_iommus(void)
4097 {
4098 	struct intel_iommu *iommu = NULL;
4099 	struct dmar_drhd_unit *drhd;
4100 
4101 	for_each_iommu(iommu, drhd)
4102 		iommu_disable_translation(iommu);
4103 }
4104 
4105 void intel_iommu_shutdown(void)
4106 {
4107 	struct dmar_drhd_unit *drhd;
4108 	struct intel_iommu *iommu = NULL;
4109 
4110 	if (no_iommu || dmar_disabled)
4111 		return;
4112 
4113 	down_write(&dmar_global_lock);
4114 
4115 	/* Disable PMRs explicitly here. */
4116 	for_each_iommu(iommu, drhd)
4117 		iommu_disable_protect_mem_regions(iommu);
4118 
4119 	/* Make sure the IOMMUs are switched off */
4120 	intel_disable_iommus();
4121 
4122 	up_write(&dmar_global_lock);
4123 }
4124 
4125 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4126 {
4127 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4128 
4129 	return container_of(iommu_dev, struct intel_iommu, iommu);
4130 }
4131 
4132 static ssize_t version_show(struct device *dev,
4133 			    struct device_attribute *attr, char *buf)
4134 {
4135 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4136 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4137 	return sprintf(buf, "%d:%d\n",
4138 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4139 }
4140 static DEVICE_ATTR_RO(version);
4141 
4142 static ssize_t address_show(struct device *dev,
4143 			    struct device_attribute *attr, char *buf)
4144 {
4145 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4146 	return sprintf(buf, "%llx\n", iommu->reg_phys);
4147 }
4148 static DEVICE_ATTR_RO(address);
4149 
4150 static ssize_t cap_show(struct device *dev,
4151 			struct device_attribute *attr, char *buf)
4152 {
4153 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4154 	return sprintf(buf, "%llx\n", iommu->cap);
4155 }
4156 static DEVICE_ATTR_RO(cap);
4157 
4158 static ssize_t ecap_show(struct device *dev,
4159 			 struct device_attribute *attr, char *buf)
4160 {
4161 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4162 	return sprintf(buf, "%llx\n", iommu->ecap);
4163 }
4164 static DEVICE_ATTR_RO(ecap);
4165 
4166 static ssize_t domains_supported_show(struct device *dev,
4167 				      struct device_attribute *attr, char *buf)
4168 {
4169 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4170 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4171 }
4172 static DEVICE_ATTR_RO(domains_supported);
4173 
4174 static ssize_t domains_used_show(struct device *dev,
4175 				 struct device_attribute *attr, char *buf)
4176 {
4177 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4178 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4179 						  cap_ndoms(iommu->cap)));
4180 }
4181 static DEVICE_ATTR_RO(domains_used);
4182 
4183 static struct attribute *intel_iommu_attrs[] = {
4184 	&dev_attr_version.attr,
4185 	&dev_attr_address.attr,
4186 	&dev_attr_cap.attr,
4187 	&dev_attr_ecap.attr,
4188 	&dev_attr_domains_supported.attr,
4189 	&dev_attr_domains_used.attr,
4190 	NULL,
4191 };
4192 
4193 static struct attribute_group intel_iommu_group = {
4194 	.name = "intel-iommu",
4195 	.attrs = intel_iommu_attrs,
4196 };
4197 
4198 const struct attribute_group *intel_iommu_groups[] = {
4199 	&intel_iommu_group,
4200 	NULL,
4201 };
4202 
4203 static inline bool has_external_pci(void)
4204 {
4205 	struct pci_dev *pdev = NULL;
4206 
4207 	for_each_pci_dev(pdev)
4208 		if (pdev->external_facing)
4209 			return true;
4210 
4211 	return false;
4212 }
4213 
4214 static int __init platform_optin_force_iommu(void)
4215 {
4216 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4217 		return 0;
4218 
4219 	if (no_iommu || dmar_disabled)
4220 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4221 
4222 	/*
4223 	 * If Intel-IOMMU is disabled by default, we will apply identity
4224 	 * map for all devices except those marked as being untrusted.
4225 	 */
4226 	if (dmar_disabled)
4227 		iommu_set_default_passthrough(false);
4228 
4229 	dmar_disabled = 0;
4230 	no_iommu = 0;
4231 
4232 	return 1;
4233 }
4234 
4235 static int __init probe_acpi_namespace_devices(void)
4236 {
4237 	struct dmar_drhd_unit *drhd;
4238 	/* To avoid a -Wunused-but-set-variable warning. */
4239 	struct intel_iommu *iommu __maybe_unused;
4240 	struct device *dev;
4241 	int i, ret = 0;
4242 
4243 	for_each_active_iommu(iommu, drhd) {
4244 		for_each_active_dev_scope(drhd->devices,
4245 					  drhd->devices_cnt, i, dev) {
4246 			struct acpi_device_physical_node *pn;
4247 			struct iommu_group *group;
4248 			struct acpi_device *adev;
4249 
4250 			if (dev->bus != &acpi_bus_type)
4251 				continue;
4252 
4253 			adev = to_acpi_device(dev);
4254 			mutex_lock(&adev->physical_node_lock);
4255 			list_for_each_entry(pn,
4256 					    &adev->physical_node_list, node) {
4257 				group = iommu_group_get(pn->dev);
4258 				if (group) {
4259 					iommu_group_put(group);
4260 					continue;
4261 				}
4262 
4263 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4264 				ret = iommu_probe_device(pn->dev);
4265 				if (ret)
4266 					break;
4267 			}
4268 			mutex_unlock(&adev->physical_node_lock);
4269 
4270 			if (ret)
4271 				return ret;
4272 		}
4273 	}
4274 
4275 	return 0;
4276 }
4277 
4278 int __init intel_iommu_init(void)
4279 {
4280 	int ret = -ENODEV;
4281 	struct dmar_drhd_unit *drhd;
4282 	struct intel_iommu *iommu;
4283 
4284 	/*
4285 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4286 	 * opt in, so enforce that.
4287 	 */
4288 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4289 		    platform_optin_force_iommu();
4290 
4291 	if (iommu_init_mempool()) {
4292 		if (force_on)
4293 			panic("tboot: Failed to initialize iommu memory\n");
4294 		return -ENOMEM;
4295 	}
4296 
4297 	down_write(&dmar_global_lock);
4298 	if (dmar_table_init()) {
4299 		if (force_on)
4300 			panic("tboot: Failed to initialize DMAR table\n");
4301 		goto out_free_dmar;
4302 	}
4303 
4304 	if (dmar_dev_scope_init() < 0) {
4305 		if (force_on)
4306 			panic("tboot: Failed to initialize DMAR device scope\n");
4307 		goto out_free_dmar;
4308 	}
4309 
4310 	up_write(&dmar_global_lock);
4311 
4312 	/*
4313 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4314 	 * complain later when we register it under the lock.
4315 	 */
4316 	dmar_register_bus_notifier();
4317 
4318 	down_write(&dmar_global_lock);
4319 
4320 	if (!no_iommu)
4321 		intel_iommu_debugfs_init();
4322 
4323 	if (no_iommu || dmar_disabled) {
4324 		/*
4325 		 * We exit the function here to ensure IOMMU's remapping and
4326 		 * mempool aren't setup, which means that the IOMMU's PMRs
4327 		 * won't be disabled via the call to init_dmars(). So disable
4328 		 * it explicitly here. The PMRs were setup by tboot prior to
4329 		 * calling SENTER, but the kernel is expected to reset/tear
4330 		 * down the PMRs.
4331 		 */
4332 		if (intel_iommu_tboot_noforce) {
4333 			for_each_iommu(iommu, drhd)
4334 				iommu_disable_protect_mem_regions(iommu);
4335 		}
4336 
4337 		/*
4338 		 * Make sure the IOMMUs are switched off, even when we
4339 		 * boot into a kexec kernel and the previous kernel left
4340 		 * them enabled
4341 		 */
4342 		intel_disable_iommus();
4343 		goto out_free_dmar;
4344 	}
4345 
4346 	if (list_empty(&dmar_rmrr_units))
4347 		pr_info("No RMRR found\n");
4348 
4349 	if (list_empty(&dmar_atsr_units))
4350 		pr_info("No ATSR found\n");
4351 
4352 	if (list_empty(&dmar_satc_units))
4353 		pr_info("No SATC found\n");
4354 
4355 	if (dmar_map_gfx)
4356 		intel_iommu_gfx_mapped = 1;
4357 
4358 	init_no_remapping_devices();
4359 
4360 	ret = init_dmars();
4361 	if (ret) {
4362 		if (force_on)
4363 			panic("tboot: Failed to initialize DMARs\n");
4364 		pr_err("Initialization failed\n");
4365 		goto out_free_dmar;
4366 	}
4367 	up_write(&dmar_global_lock);
4368 
4369 	init_iommu_pm_ops();
4370 
4371 	down_read(&dmar_global_lock);
4372 	for_each_active_iommu(iommu, drhd) {
4373 		/*
4374 		 * The flush queue implementation does not perform
4375 		 * page-selective invalidations that are required for efficient
4376 		 * TLB flushes in virtual environments.  The benefit of batching
4377 		 * is likely to be much lower than the overhead of synchronizing
4378 		 * the virtual and physical IOMMU page-tables.
4379 		 */
4380 		if (cap_caching_mode(iommu->cap)) {
4381 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4382 			iommu_set_dma_strict();
4383 		}
4384 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4385 				       intel_iommu_groups,
4386 				       "%s", iommu->name);
4387 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4388 	}
4389 	up_read(&dmar_global_lock);
4390 
4391 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4392 	if (si_domain && !hw_pass_through)
4393 		register_memory_notifier(&intel_iommu_memory_nb);
4394 
4395 	down_read(&dmar_global_lock);
4396 	if (probe_acpi_namespace_devices())
4397 		pr_warn("ACPI name space devices didn't probe correctly\n");
4398 
4399 	/* Finally, we enable the DMA remapping hardware. */
4400 	for_each_iommu(iommu, drhd) {
4401 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4402 			iommu_enable_translation(iommu);
4403 
4404 		iommu_disable_protect_mem_regions(iommu);
4405 	}
4406 	up_read(&dmar_global_lock);
4407 
4408 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4409 
4410 	intel_iommu_enabled = 1;
4411 
4412 	return 0;
4413 
4414 out_free_dmar:
4415 	intel_iommu_free_dmars();
4416 	up_write(&dmar_global_lock);
4417 	iommu_exit_mempool();
4418 	return ret;
4419 }
4420 
4421 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4422 {
4423 	struct device_domain_info *info = opaque;
4424 
4425 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4426 	return 0;
4427 }
4428 
4429 /*
4430  * NB - intel-iommu lacks any sort of reference counting for the users of
4431  * dependent devices.  If multiple endpoints have intersecting dependent
4432  * devices, unbinding the driver from any one of them will possibly leave
4433  * the others unable to operate.
4434  */
4435 static void domain_context_clear(struct device_domain_info *info)
4436 {
4437 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4438 		return;
4439 
4440 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4441 			       &domain_context_clear_one_cb, info);
4442 }
4443 
4444 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4445 {
4446 	struct dmar_domain *domain;
4447 	struct intel_iommu *iommu;
4448 	unsigned long flags;
4449 
4450 	assert_spin_locked(&device_domain_lock);
4451 
4452 	if (WARN_ON(!info))
4453 		return;
4454 
4455 	iommu = info->iommu;
4456 	domain = info->domain;
4457 
4458 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4459 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4460 			intel_pasid_tear_down_entry(iommu, info->dev,
4461 					PASID_RID2PASID, false);
4462 
4463 		iommu_disable_dev_iotlb(info);
4464 		domain_context_clear(info);
4465 		intel_pasid_free_table(info->dev);
4466 	}
4467 
4468 	unlink_domain_info(info);
4469 
4470 	spin_lock_irqsave(&iommu->lock, flags);
4471 	domain_detach_iommu(domain, iommu);
4472 	spin_unlock_irqrestore(&iommu->lock, flags);
4473 
4474 	free_devinfo_mem(info);
4475 }
4476 
4477 static void dmar_remove_one_dev_info(struct device *dev)
4478 {
4479 	struct device_domain_info *info;
4480 	unsigned long flags;
4481 
4482 	spin_lock_irqsave(&device_domain_lock, flags);
4483 	info = get_domain_info(dev);
4484 	if (info)
4485 		__dmar_remove_one_dev_info(info);
4486 	spin_unlock_irqrestore(&device_domain_lock, flags);
4487 }
4488 
4489 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4490 {
4491 	int adjust_width;
4492 
4493 	/* calculate AGAW */
4494 	domain->gaw = guest_width;
4495 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4496 	domain->agaw = width_to_agaw(adjust_width);
4497 
4498 	domain->iommu_coherency = false;
4499 	domain->iommu_snooping = false;
4500 	domain->iommu_superpage = 0;
4501 	domain->max_addr = 0;
4502 
4503 	/* always allocate the top pgd */
4504 	domain->pgd = alloc_pgtable_page(domain->nid);
4505 	if (!domain->pgd)
4506 		return -ENOMEM;
4507 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4508 	return 0;
4509 }
4510 
4511 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4512 {
4513 	struct dmar_domain *dmar_domain;
4514 	struct iommu_domain *domain;
4515 
4516 	switch (type) {
4517 	case IOMMU_DOMAIN_DMA:
4518 	case IOMMU_DOMAIN_DMA_FQ:
4519 	case IOMMU_DOMAIN_UNMANAGED:
4520 		dmar_domain = alloc_domain(0);
4521 		if (!dmar_domain) {
4522 			pr_err("Can't allocate dmar_domain\n");
4523 			return NULL;
4524 		}
4525 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4526 			pr_err("Domain initialization failed\n");
4527 			domain_exit(dmar_domain);
4528 			return NULL;
4529 		}
4530 
4531 		domain = &dmar_domain->domain;
4532 		domain->geometry.aperture_start = 0;
4533 		domain->geometry.aperture_end   =
4534 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4535 		domain->geometry.force_aperture = true;
4536 
4537 		return domain;
4538 	case IOMMU_DOMAIN_IDENTITY:
4539 		return &si_domain->domain;
4540 	default:
4541 		return NULL;
4542 	}
4543 
4544 	return NULL;
4545 }
4546 
4547 static void intel_iommu_domain_free(struct iommu_domain *domain)
4548 {
4549 	if (domain != &si_domain->domain)
4550 		domain_exit(to_dmar_domain(domain));
4551 }
4552 
4553 /*
4554  * Check whether a @domain could be attached to the @dev through the
4555  * aux-domain attach/detach APIs.
4556  */
4557 static inline bool
4558 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4559 {
4560 	struct device_domain_info *info = get_domain_info(dev);
4561 
4562 	return info && info->auxd_enabled &&
4563 			domain->type == IOMMU_DOMAIN_UNMANAGED;
4564 }
4565 
4566 static inline struct subdev_domain_info *
4567 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4568 {
4569 	struct subdev_domain_info *sinfo;
4570 
4571 	if (!list_empty(&domain->subdevices)) {
4572 		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4573 			if (sinfo->pdev == dev)
4574 				return sinfo;
4575 		}
4576 	}
4577 
4578 	return NULL;
4579 }
4580 
4581 static int auxiliary_link_device(struct dmar_domain *domain,
4582 				 struct device *dev)
4583 {
4584 	struct device_domain_info *info = get_domain_info(dev);
4585 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4586 
4587 	assert_spin_locked(&device_domain_lock);
4588 	if (WARN_ON(!info))
4589 		return -EINVAL;
4590 
4591 	if (!sinfo) {
4592 		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4593 		if (!sinfo)
4594 			return -ENOMEM;
4595 		sinfo->domain = domain;
4596 		sinfo->pdev = dev;
4597 		list_add(&sinfo->link_phys, &info->subdevices);
4598 		list_add(&sinfo->link_domain, &domain->subdevices);
4599 	}
4600 
4601 	return ++sinfo->users;
4602 }
4603 
4604 static int auxiliary_unlink_device(struct dmar_domain *domain,
4605 				   struct device *dev)
4606 {
4607 	struct device_domain_info *info = get_domain_info(dev);
4608 	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4609 	int ret;
4610 
4611 	assert_spin_locked(&device_domain_lock);
4612 	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4613 		return -EINVAL;
4614 
4615 	ret = --sinfo->users;
4616 	if (!ret) {
4617 		list_del(&sinfo->link_phys);
4618 		list_del(&sinfo->link_domain);
4619 		kfree(sinfo);
4620 	}
4621 
4622 	return ret;
4623 }
4624 
4625 static int aux_domain_add_dev(struct dmar_domain *domain,
4626 			      struct device *dev)
4627 {
4628 	int ret;
4629 	unsigned long flags;
4630 	struct intel_iommu *iommu;
4631 
4632 	iommu = device_to_iommu(dev, NULL, NULL);
4633 	if (!iommu)
4634 		return -ENODEV;
4635 
4636 	if (domain->default_pasid <= 0) {
4637 		u32 pasid;
4638 
4639 		/* No private data needed for the default pasid */
4640 		pasid = ioasid_alloc(NULL, PASID_MIN,
4641 				     pci_max_pasids(to_pci_dev(dev)) - 1,
4642 				     NULL);
4643 		if (pasid == INVALID_IOASID) {
4644 			pr_err("Can't allocate default pasid\n");
4645 			return -ENODEV;
4646 		}
4647 		domain->default_pasid = pasid;
4648 	}
4649 
4650 	spin_lock_irqsave(&device_domain_lock, flags);
4651 	ret = auxiliary_link_device(domain, dev);
4652 	if (ret <= 0)
4653 		goto link_failed;
4654 
4655 	/*
4656 	 * Subdevices from the same physical device can be attached to the
4657 	 * same domain. For such cases, only the first subdevice attachment
4658 	 * needs to go through the full steps in this function. So if ret >
4659 	 * 1, just goto out.
4660 	 */
4661 	if (ret > 1)
4662 		goto out;
4663 
4664 	/*
4665 	 * iommu->lock must be held to attach domain to iommu and setup the
4666 	 * pasid entry for second level translation.
4667 	 */
4668 	spin_lock(&iommu->lock);
4669 	ret = domain_attach_iommu(domain, iommu);
4670 	if (ret)
4671 		goto attach_failed;
4672 
4673 	/* Setup the PASID entry for mediated devices: */
4674 	if (domain_use_first_level(domain))
4675 		ret = domain_setup_first_level(iommu, domain, dev,
4676 					       domain->default_pasid);
4677 	else
4678 		ret = intel_pasid_setup_second_level(iommu, domain, dev,
4679 						     domain->default_pasid);
4680 	if (ret)
4681 		goto table_failed;
4682 
4683 	spin_unlock(&iommu->lock);
4684 out:
4685 	spin_unlock_irqrestore(&device_domain_lock, flags);
4686 
4687 	return 0;
4688 
4689 table_failed:
4690 	domain_detach_iommu(domain, iommu);
4691 attach_failed:
4692 	spin_unlock(&iommu->lock);
4693 	auxiliary_unlink_device(domain, dev);
4694 link_failed:
4695 	spin_unlock_irqrestore(&device_domain_lock, flags);
4696 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4697 		ioasid_put(domain->default_pasid);
4698 
4699 	return ret;
4700 }
4701 
4702 static void aux_domain_remove_dev(struct dmar_domain *domain,
4703 				  struct device *dev)
4704 {
4705 	struct device_domain_info *info;
4706 	struct intel_iommu *iommu;
4707 	unsigned long flags;
4708 
4709 	if (!is_aux_domain(dev, &domain->domain))
4710 		return;
4711 
4712 	spin_lock_irqsave(&device_domain_lock, flags);
4713 	info = get_domain_info(dev);
4714 	iommu = info->iommu;
4715 
4716 	if (!auxiliary_unlink_device(domain, dev)) {
4717 		spin_lock(&iommu->lock);
4718 		intel_pasid_tear_down_entry(iommu, dev,
4719 					    domain->default_pasid, false);
4720 		domain_detach_iommu(domain, iommu);
4721 		spin_unlock(&iommu->lock);
4722 	}
4723 
4724 	spin_unlock_irqrestore(&device_domain_lock, flags);
4725 
4726 	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4727 		ioasid_put(domain->default_pasid);
4728 }
4729 
4730 static int prepare_domain_attach_device(struct iommu_domain *domain,
4731 					struct device *dev)
4732 {
4733 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4734 	struct intel_iommu *iommu;
4735 	int addr_width;
4736 
4737 	iommu = device_to_iommu(dev, NULL, NULL);
4738 	if (!iommu)
4739 		return -ENODEV;
4740 
4741 	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4742 	    !ecap_nest(iommu->ecap)) {
4743 		dev_err(dev, "%s: iommu not support nested translation\n",
4744 			iommu->name);
4745 		return -EINVAL;
4746 	}
4747 
4748 	/* check if this iommu agaw is sufficient for max mapped address */
4749 	addr_width = agaw_to_width(iommu->agaw);
4750 	if (addr_width > cap_mgaw(iommu->cap))
4751 		addr_width = cap_mgaw(iommu->cap);
4752 
4753 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4754 		dev_err(dev, "%s: iommu width (%d) is not "
4755 		        "sufficient for the mapped address (%llx)\n",
4756 		        __func__, addr_width, dmar_domain->max_addr);
4757 		return -EFAULT;
4758 	}
4759 	dmar_domain->gaw = addr_width;
4760 
4761 	/*
4762 	 * Knock out extra levels of page tables if necessary
4763 	 */
4764 	while (iommu->agaw < dmar_domain->agaw) {
4765 		struct dma_pte *pte;
4766 
4767 		pte = dmar_domain->pgd;
4768 		if (dma_pte_present(pte)) {
4769 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4770 			free_pgtable_page(pte);
4771 		}
4772 		dmar_domain->agaw--;
4773 	}
4774 
4775 	return 0;
4776 }
4777 
4778 static int intel_iommu_attach_device(struct iommu_domain *domain,
4779 				     struct device *dev)
4780 {
4781 	int ret;
4782 
4783 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4784 	    device_is_rmrr_locked(dev)) {
4785 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4786 		return -EPERM;
4787 	}
4788 
4789 	if (is_aux_domain(dev, domain))
4790 		return -EPERM;
4791 
4792 	/* normally dev is not mapped */
4793 	if (unlikely(domain_context_mapped(dev))) {
4794 		struct dmar_domain *old_domain;
4795 
4796 		old_domain = find_domain(dev);
4797 		if (old_domain)
4798 			dmar_remove_one_dev_info(dev);
4799 	}
4800 
4801 	ret = prepare_domain_attach_device(domain, dev);
4802 	if (ret)
4803 		return ret;
4804 
4805 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4806 }
4807 
4808 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4809 					 struct device *dev)
4810 {
4811 	int ret;
4812 
4813 	if (!is_aux_domain(dev, domain))
4814 		return -EPERM;
4815 
4816 	ret = prepare_domain_attach_device(domain, dev);
4817 	if (ret)
4818 		return ret;
4819 
4820 	return aux_domain_add_dev(to_dmar_domain(domain), dev);
4821 }
4822 
4823 static void intel_iommu_detach_device(struct iommu_domain *domain,
4824 				      struct device *dev)
4825 {
4826 	dmar_remove_one_dev_info(dev);
4827 }
4828 
4829 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4830 					  struct device *dev)
4831 {
4832 	aux_domain_remove_dev(to_dmar_domain(domain), dev);
4833 }
4834 
4835 #ifdef CONFIG_INTEL_IOMMU_SVM
4836 /*
4837  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4838  * VT-d granularity. Invalidation is typically included in the unmap operation
4839  * as a result of DMA or VFIO unmap. However, for assigned devices guest
4840  * owns the first level page tables. Invalidations of translation caches in the
4841  * guest are trapped and passed down to the host.
4842  *
4843  * vIOMMU in the guest will only expose first level page tables, therefore
4844  * we do not support IOTLB granularity for request without PASID (second level).
4845  *
4846  * For example, to find the VT-d granularity encoding for IOTLB
4847  * type and page selective granularity within PASID:
4848  * X: indexed by iommu cache type
4849  * Y: indexed by enum iommu_inv_granularity
4850  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4851  */
4852 
4853 static const int
4854 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4855 	/*
4856 	 * PASID based IOTLB invalidation: PASID selective (per PASID),
4857 	 * page selective (address granularity)
4858 	 */
4859 	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4860 	/* PASID based dev TLBs */
4861 	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4862 	/* PASID cache */
4863 	{-EINVAL, -EINVAL, -EINVAL}
4864 };
4865 
4866 static inline int to_vtd_granularity(int type, int granu)
4867 {
4868 	return inv_type_granu_table[type][granu];
4869 }
4870 
4871 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4872 {
4873 	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4874 
4875 	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
4876 	 * IOMMU cache invalidate API passes granu_size in bytes, and number of
4877 	 * granu size in contiguous memory.
4878 	 */
4879 	return order_base_2(nr_pages);
4880 }
4881 
4882 static int
4883 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4884 			   struct iommu_cache_invalidate_info *inv_info)
4885 {
4886 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4887 	struct device_domain_info *info;
4888 	struct intel_iommu *iommu;
4889 	unsigned long flags;
4890 	int cache_type;
4891 	u8 bus, devfn;
4892 	u16 did, sid;
4893 	int ret = 0;
4894 	u64 size = 0;
4895 
4896 	if (!inv_info || !dmar_domain)
4897 		return -EINVAL;
4898 
4899 	if (!dev || !dev_is_pci(dev))
4900 		return -ENODEV;
4901 
4902 	iommu = device_to_iommu(dev, &bus, &devfn);
4903 	if (!iommu)
4904 		return -ENODEV;
4905 
4906 	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4907 		return -EINVAL;
4908 
4909 	spin_lock_irqsave(&device_domain_lock, flags);
4910 	spin_lock(&iommu->lock);
4911 	info = get_domain_info(dev);
4912 	if (!info) {
4913 		ret = -EINVAL;
4914 		goto out_unlock;
4915 	}
4916 	did = dmar_domain->iommu_did[iommu->seq_id];
4917 	sid = PCI_DEVID(bus, devfn);
4918 
4919 	/* Size is only valid in address selective invalidation */
4920 	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4921 		size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4922 				   inv_info->granu.addr_info.nb_granules);
4923 
4924 	for_each_set_bit(cache_type,
4925 			 (unsigned long *)&inv_info->cache,
4926 			 IOMMU_CACHE_INV_TYPE_NR) {
4927 		int granu = 0;
4928 		u64 pasid = 0;
4929 		u64 addr = 0;
4930 
4931 		granu = to_vtd_granularity(cache_type, inv_info->granularity);
4932 		if (granu == -EINVAL) {
4933 			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4934 					   cache_type, inv_info->granularity);
4935 			break;
4936 		}
4937 
4938 		/*
4939 		 * PASID is stored in different locations based on the
4940 		 * granularity.
4941 		 */
4942 		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4943 		    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4944 			pasid = inv_info->granu.pasid_info.pasid;
4945 		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4946 			 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4947 			pasid = inv_info->granu.addr_info.pasid;
4948 
4949 		switch (BIT(cache_type)) {
4950 		case IOMMU_CACHE_INV_TYPE_IOTLB:
4951 			/* HW will ignore LSB bits based on address mask */
4952 			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4953 			    size &&
4954 			    (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4955 				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4956 						   inv_info->granu.addr_info.addr, size);
4957 			}
4958 
4959 			/*
4960 			 * If granu is PASID-selective, address is ignored.
4961 			 * We use npages = -1 to indicate that.
4962 			 */
4963 			qi_flush_piotlb(iommu, did, pasid,
4964 					mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4965 					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4966 					inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4967 
4968 			if (!info->ats_enabled)
4969 				break;
4970 			/*
4971 			 * Always flush device IOTLB if ATS is enabled. vIOMMU
4972 			 * in the guest may assume IOTLB flush is inclusive,
4973 			 * which is more efficient.
4974 			 */
4975 			fallthrough;
4976 		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4977 			/*
4978 			 * PASID based device TLB invalidation does not support
4979 			 * IOMMU_INV_GRANU_PASID granularity but only supports
4980 			 * IOMMU_INV_GRANU_ADDR.
4981 			 * The equivalent of that is we set the size to be the
4982 			 * entire range of 64 bit. User only provides PASID info
4983 			 * without address info. So we set addr to 0.
4984 			 */
4985 			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4986 				size = 64 - VTD_PAGE_SHIFT;
4987 				addr = 0;
4988 			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4989 				addr = inv_info->granu.addr_info.addr;
4990 			}
4991 
4992 			if (info->ats_enabled)
4993 				qi_flush_dev_iotlb_pasid(iommu, sid,
4994 						info->pfsid, pasid,
4995 						info->ats_qdep, addr,
4996 						size);
4997 			else
4998 				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4999 			break;
5000 		default:
5001 			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5002 					    cache_type);
5003 			ret = -EINVAL;
5004 		}
5005 	}
5006 out_unlock:
5007 	spin_unlock(&iommu->lock);
5008 	spin_unlock_irqrestore(&device_domain_lock, flags);
5009 
5010 	return ret;
5011 }
5012 #endif
5013 
5014 static int intel_iommu_map(struct iommu_domain *domain,
5015 			   unsigned long iova, phys_addr_t hpa,
5016 			   size_t size, int iommu_prot, gfp_t gfp)
5017 {
5018 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5019 	u64 max_addr;
5020 	int prot = 0;
5021 
5022 	if (iommu_prot & IOMMU_READ)
5023 		prot |= DMA_PTE_READ;
5024 	if (iommu_prot & IOMMU_WRITE)
5025 		prot |= DMA_PTE_WRITE;
5026 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5027 		prot |= DMA_PTE_SNP;
5028 
5029 	max_addr = iova + size;
5030 	if (dmar_domain->max_addr < max_addr) {
5031 		u64 end;
5032 
5033 		/* check if minimum agaw is sufficient for mapped address */
5034 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5035 		if (end < max_addr) {
5036 			pr_err("%s: iommu width (%d) is not "
5037 			       "sufficient for the mapped address (%llx)\n",
5038 			       __func__, dmar_domain->gaw, max_addr);
5039 			return -EFAULT;
5040 		}
5041 		dmar_domain->max_addr = max_addr;
5042 	}
5043 	/* Round up size to next multiple of PAGE_SIZE, if it and
5044 	   the low bits of hpa would take us onto the next page */
5045 	size = aligned_nrpages(hpa, size);
5046 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5047 				hpa >> VTD_PAGE_SHIFT, size, prot);
5048 }
5049 
5050 static int intel_iommu_map_pages(struct iommu_domain *domain,
5051 				 unsigned long iova, phys_addr_t paddr,
5052 				 size_t pgsize, size_t pgcount,
5053 				 int prot, gfp_t gfp, size_t *mapped)
5054 {
5055 	unsigned long pgshift = __ffs(pgsize);
5056 	size_t size = pgcount << pgshift;
5057 	int ret;
5058 
5059 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5060 		return -EINVAL;
5061 
5062 	if (!IS_ALIGNED(iova | paddr, pgsize))
5063 		return -EINVAL;
5064 
5065 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5066 	if (!ret && mapped)
5067 		*mapped = size;
5068 
5069 	return ret;
5070 }
5071 
5072 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5073 				unsigned long iova, size_t size,
5074 				struct iommu_iotlb_gather *gather)
5075 {
5076 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5077 	unsigned long start_pfn, last_pfn;
5078 	int level = 0;
5079 
5080 	/* Cope with horrid API which requires us to unmap more than the
5081 	   size argument if it happens to be a large-page mapping. */
5082 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5083 
5084 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5085 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5086 
5087 	start_pfn = iova >> VTD_PAGE_SHIFT;
5088 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5089 
5090 	gather->freelist = domain_unmap(dmar_domain, start_pfn,
5091 					last_pfn, gather->freelist);
5092 
5093 	if (dmar_domain->max_addr == iova + size)
5094 		dmar_domain->max_addr = iova;
5095 
5096 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
5097 
5098 	return size;
5099 }
5100 
5101 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5102 				      unsigned long iova,
5103 				      size_t pgsize, size_t pgcount,
5104 				      struct iommu_iotlb_gather *gather)
5105 {
5106 	unsigned long pgshift = __ffs(pgsize);
5107 	size_t size = pgcount << pgshift;
5108 
5109 	return intel_iommu_unmap(domain, iova, size, gather);
5110 }
5111 
5112 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5113 				 struct iommu_iotlb_gather *gather)
5114 {
5115 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5116 	unsigned long iova_pfn = IOVA_PFN(gather->start);
5117 	size_t size = gather->end - gather->start;
5118 	unsigned long start_pfn;
5119 	unsigned long nrpages;
5120 	int iommu_id;
5121 
5122 	nrpages = aligned_nrpages(gather->start, size);
5123 	start_pfn = mm_to_dma_pfn(iova_pfn);
5124 
5125 	for_each_domain_iommu(iommu_id, dmar_domain)
5126 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5127 				      start_pfn, nrpages, !gather->freelist, 0);
5128 
5129 	dma_free_pagelist(gather->freelist);
5130 }
5131 
5132 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5133 					    dma_addr_t iova)
5134 {
5135 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5136 	struct dma_pte *pte;
5137 	int level = 0;
5138 	u64 phys = 0;
5139 
5140 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5141 	if (pte && dma_pte_present(pte))
5142 		phys = dma_pte_addr(pte) +
5143 			(iova & (BIT_MASK(level_to_offset_bits(level) +
5144 						VTD_PAGE_SHIFT) - 1));
5145 
5146 	return phys;
5147 }
5148 
5149 static bool intel_iommu_capable(enum iommu_cap cap)
5150 {
5151 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5152 		return domain_update_iommu_snooping(NULL);
5153 	if (cap == IOMMU_CAP_INTR_REMAP)
5154 		return irq_remapping_enabled == 1;
5155 
5156 	return false;
5157 }
5158 
5159 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5160 {
5161 	struct intel_iommu *iommu;
5162 
5163 	iommu = device_to_iommu(dev, NULL, NULL);
5164 	if (!iommu)
5165 		return ERR_PTR(-ENODEV);
5166 
5167 	if (translation_pre_enabled(iommu))
5168 		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5169 
5170 	return &iommu->iommu;
5171 }
5172 
5173 static void intel_iommu_release_device(struct device *dev)
5174 {
5175 	struct intel_iommu *iommu;
5176 
5177 	iommu = device_to_iommu(dev, NULL, NULL);
5178 	if (!iommu)
5179 		return;
5180 
5181 	dmar_remove_one_dev_info(dev);
5182 
5183 	set_dma_ops(dev, NULL);
5184 }
5185 
5186 static void intel_iommu_probe_finalize(struct device *dev)
5187 {
5188 	set_dma_ops(dev, NULL);
5189 	iommu_setup_dma_ops(dev, 0, U64_MAX);
5190 }
5191 
5192 static void intel_iommu_get_resv_regions(struct device *device,
5193 					 struct list_head *head)
5194 {
5195 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5196 	struct iommu_resv_region *reg;
5197 	struct dmar_rmrr_unit *rmrr;
5198 	struct device *i_dev;
5199 	int i;
5200 
5201 	down_read(&dmar_global_lock);
5202 	for_each_rmrr_units(rmrr) {
5203 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5204 					  i, i_dev) {
5205 			struct iommu_resv_region *resv;
5206 			enum iommu_resv_type type;
5207 			size_t length;
5208 
5209 			if (i_dev != device &&
5210 			    !is_downstream_to_pci_bridge(device, i_dev))
5211 				continue;
5212 
5213 			length = rmrr->end_address - rmrr->base_address + 1;
5214 
5215 			type = device_rmrr_is_relaxable(device) ?
5216 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5217 
5218 			resv = iommu_alloc_resv_region(rmrr->base_address,
5219 						       length, prot, type);
5220 			if (!resv)
5221 				break;
5222 
5223 			list_add_tail(&resv->list, head);
5224 		}
5225 	}
5226 	up_read(&dmar_global_lock);
5227 
5228 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5229 	if (dev_is_pci(device)) {
5230 		struct pci_dev *pdev = to_pci_dev(device);
5231 
5232 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5233 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5234 						   IOMMU_RESV_DIRECT_RELAXABLE);
5235 			if (reg)
5236 				list_add_tail(&reg->list, head);
5237 		}
5238 	}
5239 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5240 
5241 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5242 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5243 				      0, IOMMU_RESV_MSI);
5244 	if (!reg)
5245 		return;
5246 	list_add_tail(&reg->list, head);
5247 }
5248 
5249 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5250 {
5251 	struct device_domain_info *info;
5252 	struct context_entry *context;
5253 	struct dmar_domain *domain;
5254 	unsigned long flags;
5255 	u64 ctx_lo;
5256 	int ret;
5257 
5258 	domain = find_domain(dev);
5259 	if (!domain)
5260 		return -EINVAL;
5261 
5262 	spin_lock_irqsave(&device_domain_lock, flags);
5263 	spin_lock(&iommu->lock);
5264 
5265 	ret = -EINVAL;
5266 	info = get_domain_info(dev);
5267 	if (!info || !info->pasid_supported)
5268 		goto out;
5269 
5270 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5271 	if (WARN_ON(!context))
5272 		goto out;
5273 
5274 	ctx_lo = context[0].lo;
5275 
5276 	if (!(ctx_lo & CONTEXT_PASIDE)) {
5277 		ctx_lo |= CONTEXT_PASIDE;
5278 		context[0].lo = ctx_lo;
5279 		wmb();
5280 		iommu->flush.flush_context(iommu,
5281 					   domain->iommu_did[iommu->seq_id],
5282 					   PCI_DEVID(info->bus, info->devfn),
5283 					   DMA_CCMD_MASK_NOBIT,
5284 					   DMA_CCMD_DEVICE_INVL);
5285 	}
5286 
5287 	/* Enable PASID support in the device, if it wasn't already */
5288 	if (!info->pasid_enabled)
5289 		iommu_enable_dev_iotlb(info);
5290 
5291 	ret = 0;
5292 
5293  out:
5294 	spin_unlock(&iommu->lock);
5295 	spin_unlock_irqrestore(&device_domain_lock, flags);
5296 
5297 	return ret;
5298 }
5299 
5300 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5301 {
5302 	if (dev_is_pci(dev))
5303 		return pci_device_group(dev);
5304 	return generic_device_group(dev);
5305 }
5306 
5307 static int intel_iommu_enable_auxd(struct device *dev)
5308 {
5309 	struct device_domain_info *info;
5310 	struct intel_iommu *iommu;
5311 	unsigned long flags;
5312 	int ret;
5313 
5314 	iommu = device_to_iommu(dev, NULL, NULL);
5315 	if (!iommu || dmar_disabled)
5316 		return -EINVAL;
5317 
5318 	if (!sm_supported(iommu) || !pasid_supported(iommu))
5319 		return -EINVAL;
5320 
5321 	ret = intel_iommu_enable_pasid(iommu, dev);
5322 	if (ret)
5323 		return -ENODEV;
5324 
5325 	spin_lock_irqsave(&device_domain_lock, flags);
5326 	info = get_domain_info(dev);
5327 	info->auxd_enabled = 1;
5328 	spin_unlock_irqrestore(&device_domain_lock, flags);
5329 
5330 	return 0;
5331 }
5332 
5333 static int intel_iommu_disable_auxd(struct device *dev)
5334 {
5335 	struct device_domain_info *info;
5336 	unsigned long flags;
5337 
5338 	spin_lock_irqsave(&device_domain_lock, flags);
5339 	info = get_domain_info(dev);
5340 	if (!WARN_ON(!info))
5341 		info->auxd_enabled = 0;
5342 	spin_unlock_irqrestore(&device_domain_lock, flags);
5343 
5344 	return 0;
5345 }
5346 
5347 static int intel_iommu_enable_sva(struct device *dev)
5348 {
5349 	struct device_domain_info *info = get_domain_info(dev);
5350 	struct intel_iommu *iommu;
5351 	int ret;
5352 
5353 	if (!info || dmar_disabled)
5354 		return -EINVAL;
5355 
5356 	iommu = info->iommu;
5357 	if (!iommu)
5358 		return -EINVAL;
5359 
5360 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5361 		return -ENODEV;
5362 
5363 	if (intel_iommu_enable_pasid(iommu, dev))
5364 		return -ENODEV;
5365 
5366 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5367 		return -EINVAL;
5368 
5369 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5370 	if (!ret)
5371 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5372 
5373 	return ret;
5374 }
5375 
5376 static int intel_iommu_disable_sva(struct device *dev)
5377 {
5378 	struct device_domain_info *info = get_domain_info(dev);
5379 	struct intel_iommu *iommu = info->iommu;
5380 	int ret;
5381 
5382 	ret = iommu_unregister_device_fault_handler(dev);
5383 	if (!ret)
5384 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5385 
5386 	return ret;
5387 }
5388 
5389 /*
5390  * A PCI express designated vendor specific extended capability is defined
5391  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5392  * for system software and tools to detect endpoint devices supporting the
5393  * Intel scalable IO virtualization without host driver dependency.
5394  *
5395  * Returns the address of the matching extended capability structure within
5396  * the device's PCI configuration space or 0 if the device does not support
5397  * it.
5398  */
5399 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5400 {
5401 	int pos;
5402 	u16 vendor, id;
5403 
5404 	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5405 	while (pos) {
5406 		pci_read_config_word(pdev, pos + 4, &vendor);
5407 		pci_read_config_word(pdev, pos + 8, &id);
5408 		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5409 			return pos;
5410 
5411 		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5412 	}
5413 
5414 	return 0;
5415 }
5416 
5417 static bool
5418 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5419 {
5420 	struct device_domain_info *info = get_domain_info(dev);
5421 
5422 	if (feat == IOMMU_DEV_FEAT_AUX) {
5423 		int ret;
5424 
5425 		if (!dev_is_pci(dev) || dmar_disabled ||
5426 		    !scalable_mode_support() || !pasid_mode_support())
5427 			return false;
5428 
5429 		ret = pci_pasid_features(to_pci_dev(dev));
5430 		if (ret < 0)
5431 			return false;
5432 
5433 		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5434 	}
5435 
5436 	if (feat == IOMMU_DEV_FEAT_IOPF)
5437 		return info && info->pri_supported;
5438 
5439 	if (feat == IOMMU_DEV_FEAT_SVA)
5440 		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5441 			info->pasid_supported && info->pri_supported &&
5442 			info->ats_supported;
5443 
5444 	return false;
5445 }
5446 
5447 static int
5448 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5449 {
5450 	switch (feat) {
5451 	case IOMMU_DEV_FEAT_AUX:
5452 		return intel_iommu_enable_auxd(dev);
5453 
5454 	case IOMMU_DEV_FEAT_IOPF:
5455 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5456 
5457 	case IOMMU_DEV_FEAT_SVA:
5458 		return intel_iommu_enable_sva(dev);
5459 
5460 	default:
5461 		return -ENODEV;
5462 	}
5463 }
5464 
5465 static int
5466 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5467 {
5468 	switch (feat) {
5469 	case IOMMU_DEV_FEAT_AUX:
5470 		return intel_iommu_disable_auxd(dev);
5471 
5472 	case IOMMU_DEV_FEAT_IOPF:
5473 		return 0;
5474 
5475 	case IOMMU_DEV_FEAT_SVA:
5476 		return intel_iommu_disable_sva(dev);
5477 
5478 	default:
5479 		return -ENODEV;
5480 	}
5481 }
5482 
5483 static bool
5484 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5485 {
5486 	struct device_domain_info *info = get_domain_info(dev);
5487 
5488 	if (feat == IOMMU_DEV_FEAT_AUX)
5489 		return scalable_mode_support() && info && info->auxd_enabled;
5490 
5491 	return false;
5492 }
5493 
5494 static int
5495 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5496 {
5497 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5498 
5499 	return dmar_domain->default_pasid > 0 ?
5500 			dmar_domain->default_pasid : -EINVAL;
5501 }
5502 
5503 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5504 					   struct device *dev)
5505 {
5506 	return attach_deferred(dev);
5507 }
5508 
5509 static int
5510 intel_iommu_enable_nesting(struct iommu_domain *domain)
5511 {
5512 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5513 	unsigned long flags;
5514 	int ret = -ENODEV;
5515 
5516 	spin_lock_irqsave(&device_domain_lock, flags);
5517 	if (list_empty(&dmar_domain->devices)) {
5518 		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5519 		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5520 		ret = 0;
5521 	}
5522 	spin_unlock_irqrestore(&device_domain_lock, flags);
5523 
5524 	return ret;
5525 }
5526 
5527 /*
5528  * Check that the device does not live on an external facing PCI port that is
5529  * marked as untrusted. Such devices should not be able to apply quirks and
5530  * thus not be able to bypass the IOMMU restrictions.
5531  */
5532 static bool risky_device(struct pci_dev *pdev)
5533 {
5534 	if (pdev->untrusted) {
5535 		pci_info(pdev,
5536 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5537 			 pdev->vendor, pdev->device);
5538 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5539 		return true;
5540 	}
5541 	return false;
5542 }
5543 
5544 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5545 				       unsigned long iova, size_t size)
5546 {
5547 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5548 	unsigned long pages = aligned_nrpages(iova, size);
5549 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5550 	struct intel_iommu *iommu;
5551 	int iommu_id;
5552 
5553 	for_each_domain_iommu(iommu_id, dmar_domain) {
5554 		iommu = g_iommus[iommu_id];
5555 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
5556 	}
5557 }
5558 
5559 const struct iommu_ops intel_iommu_ops = {
5560 	.capable		= intel_iommu_capable,
5561 	.domain_alloc		= intel_iommu_domain_alloc,
5562 	.domain_free		= intel_iommu_domain_free,
5563 	.enable_nesting		= intel_iommu_enable_nesting,
5564 	.attach_dev		= intel_iommu_attach_device,
5565 	.detach_dev		= intel_iommu_detach_device,
5566 	.aux_attach_dev		= intel_iommu_aux_attach_device,
5567 	.aux_detach_dev		= intel_iommu_aux_detach_device,
5568 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
5569 	.map_pages		= intel_iommu_map_pages,
5570 	.unmap_pages		= intel_iommu_unmap_pages,
5571 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
5572 	.flush_iotlb_all        = intel_flush_iotlb_all,
5573 	.iotlb_sync		= intel_iommu_tlb_sync,
5574 	.iova_to_phys		= intel_iommu_iova_to_phys,
5575 	.probe_device		= intel_iommu_probe_device,
5576 	.probe_finalize		= intel_iommu_probe_finalize,
5577 	.release_device		= intel_iommu_release_device,
5578 	.get_resv_regions	= intel_iommu_get_resv_regions,
5579 	.put_resv_regions	= generic_iommu_put_resv_regions,
5580 	.device_group		= intel_iommu_device_group,
5581 	.dev_has_feat		= intel_iommu_dev_has_feat,
5582 	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
5583 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
5584 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
5585 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
5586 	.def_domain_type	= device_def_domain_type,
5587 	.pgsize_bitmap		= SZ_4K,
5588 #ifdef CONFIG_INTEL_IOMMU_SVM
5589 	.cache_invalidate	= intel_iommu_sva_invalidate,
5590 	.sva_bind_gpasid	= intel_svm_bind_gpasid,
5591 	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
5592 	.sva_bind		= intel_svm_bind,
5593 	.sva_unbind		= intel_svm_unbind,
5594 	.sva_get_pasid		= intel_svm_get_pasid,
5595 	.page_response		= intel_svm_page_response,
5596 #endif
5597 };
5598 
5599 static void quirk_iommu_igfx(struct pci_dev *dev)
5600 {
5601 	if (risky_device(dev))
5602 		return;
5603 
5604 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5605 	dmar_map_gfx = 0;
5606 }
5607 
5608 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5609 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5610 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5611 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5612 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5613 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5615 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5616 
5617 /* Broadwell igfx malfunctions with dmar */
5618 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5619 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5620 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5621 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5622 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5623 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5624 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5625 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5633 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5634 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5642 
5643 static void quirk_iommu_rwbf(struct pci_dev *dev)
5644 {
5645 	if (risky_device(dev))
5646 		return;
5647 
5648 	/*
5649 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5650 	 * but needs it. Same seems to hold for the desktop versions.
5651 	 */
5652 	pci_info(dev, "Forcing write-buffer flush capability\n");
5653 	rwbf_quirk = 1;
5654 }
5655 
5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5663 
5664 #define GGC 0x52
5665 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5666 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5667 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5668 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5669 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5670 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5671 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5672 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5673 
5674 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5675 {
5676 	unsigned short ggc;
5677 
5678 	if (risky_device(dev))
5679 		return;
5680 
5681 	if (pci_read_config_word(dev, GGC, &ggc))
5682 		return;
5683 
5684 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5685 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5686 		dmar_map_gfx = 0;
5687 	} else if (dmar_map_gfx) {
5688 		/* we have to ensure the gfx device is idle before we flush */
5689 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5690 		iommu_set_dma_strict();
5691 	}
5692 }
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5697 
5698 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5699 {
5700 	unsigned short ver;
5701 
5702 	if (!IS_GFX_DEVICE(dev))
5703 		return;
5704 
5705 	ver = (dev->device >> 8) & 0xff;
5706 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5707 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5708 	    ver != 0x9a)
5709 		return;
5710 
5711 	if (risky_device(dev))
5712 		return;
5713 
5714 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5715 	iommu_skip_te_disable = 1;
5716 }
5717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5718 
5719 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5720    ISOCH DMAR unit for the Azalia sound device, but not give it any
5721    TLB entries, which causes it to deadlock. Check for that.  We do
5722    this in a function called from init_dmars(), instead of in a PCI
5723    quirk, because we don't want to print the obnoxious "BIOS broken"
5724    message if VT-d is actually disabled.
5725 */
5726 static void __init check_tylersburg_isoch(void)
5727 {
5728 	struct pci_dev *pdev;
5729 	uint32_t vtisochctrl;
5730 
5731 	/* If there's no Azalia in the system anyway, forget it. */
5732 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5733 	if (!pdev)
5734 		return;
5735 
5736 	if (risky_device(pdev)) {
5737 		pci_dev_put(pdev);
5738 		return;
5739 	}
5740 
5741 	pci_dev_put(pdev);
5742 
5743 	/* System Management Registers. Might be hidden, in which case
5744 	   we can't do the sanity check. But that's OK, because the
5745 	   known-broken BIOSes _don't_ actually hide it, so far. */
5746 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5747 	if (!pdev)
5748 		return;
5749 
5750 	if (risky_device(pdev)) {
5751 		pci_dev_put(pdev);
5752 		return;
5753 	}
5754 
5755 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5756 		pci_dev_put(pdev);
5757 		return;
5758 	}
5759 
5760 	pci_dev_put(pdev);
5761 
5762 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5763 	if (vtisochctrl & 1)
5764 		return;
5765 
5766 	/* Drop all bits other than the number of TLB entries */
5767 	vtisochctrl &= 0x1c;
5768 
5769 	/* If we have the recommended number of TLB entries (16), fine. */
5770 	if (vtisochctrl == 0x10)
5771 		return;
5772 
5773 	/* Zero TLB entries? You get to ride the short bus to school. */
5774 	if (!vtisochctrl) {
5775 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5776 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5777 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5778 		     dmi_get_system_info(DMI_BIOS_VERSION),
5779 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5780 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5781 		return;
5782 	}
5783 
5784 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5785 	       vtisochctrl);
5786 }
5787