xref: /linux/drivers/iommu/intel/iommu.c (revision c9933d494c54f72290831191c09bb8488bfd5905)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28 
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
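/*
 * Worked example of the macros above, assuming VTD_PAGE_SHIFT == 12 and
 * a 48-bit gaw:
 *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1  = 0xfffffffff
 *   DOMAIN_MAX_ADDR(48)  = 0xfffffffff << 12 = 0xfffffffff000
 * On a 32-bit kernel DOMAIN_MAX_PFN() clamps the PFN to ULONG_MAX so that
 * PFN arithmetic can safely stay in 'unsigned long'.
 */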
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
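/*
 * Sketch of the agaw/width/level relationship implemented above, assuming
 * LEVEL_STRIDE == 9 and a 12-bit page offset:
 *   agaw 1 -> 39-bit width -> 3-level page table
 *   agaw 2 -> 48-bit width -> 4-level page table
 *   agaw 3 -> 57-bit width -> 5-level page table
 * e.g. width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and
 *      agaw_to_level(2)  = 4.
 */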
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must never be larger than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
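/*
 * Example of the MM-to-VT-d PFN conversion above: with 4KiB VT-d pages
 * (VTD_PAGE_SHIFT == 12) and a hypothetical 64KiB MM page size
 * (PAGE_SHIFT == 16), one MM PFN covers 16 VT-d PFNs, so
 * mm_to_dma_pfn(pfn) == pfn << 4.  With matching 4KiB MM pages the shift
 * is zero and the two PFN spaces are identical.
 */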
128 
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131 
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136 
137 /*
138  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
139  * (used when the kernel is launched w/ TXT)
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173 	context->lo &= ~(1ULL << 11);
174 }
175 
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178 	return !!(context->lo & (1ULL << 11));
179 }
180 
181 static inline void context_set_copied(struct context_entry *context)
182 {
183 	context->hi |= (1ull << 3);
184 }
185 
186 static inline bool context_copied(struct context_entry *context)
187 {
188 	return !!(context->hi & (1ULL << 3));
189 }
190 
191 static inline bool __context_present(struct context_entry *context)
192 {
193 	return (context->lo & 1);
194 }
195 
196 bool context_present(struct context_entry *context)
197 {
198 	return context_pasid_enabled(context) ?
199 	     __context_present(context) :
200 	     __context_present(context) && !context_copied(context);
201 }
202 
203 static inline void context_set_present(struct context_entry *context)
204 {
205 	context->lo |= 1;
206 }
207 
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210 	context->lo &= (((u64)-1) << 2) | 1;
211 }
212 
213 static inline void context_set_translation_type(struct context_entry *context,
214 						unsigned long value)
215 {
216 	context->lo &= (((u64)-1) << 4) | 3;
217 	context->lo |= (value & 3) << 2;
218 }
219 
220 static inline void context_set_address_root(struct context_entry *context,
221 					    unsigned long value)
222 {
223 	context->lo &= ~VTD_PAGE_MASK;
224 	context->lo |= value & VTD_PAGE_MASK;
225 }
226 
227 static inline void context_set_address_width(struct context_entry *context,
228 					     unsigned long value)
229 {
230 	context->hi |= value & 7;
231 }
232 
233 static inline void context_set_domain_id(struct context_entry *context,
234 					 unsigned long value)
235 {
236 	context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238 
239 static inline int context_domain_id(struct context_entry *c)
240 {
241 	return((c->hi >> 8) & 0xffff);
242 }
243 
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246 	context->lo = 0;
247 	context->hi = 0;
248 }
249 
250 /*
251  * This domain is a static identity mapping domain.
252  *	1. This domain creates a static 1:1 mapping to all usable memory.
253  *	2. It maps to each iommu if successful.
254  *	3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258 
259 #define for_each_domain_iommu(idx, domain)			\
260 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
261 		if (domain->iommu_refcnt[idx])
262 
263 struct dmar_rmrr_unit {
264 	struct list_head list;		/* list of rmrr units	*/
265 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
266 	u64	base_address;		/* reserved base address*/
267 	u64	end_address;		/* reserved end address */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	int	devices_cnt;		/* target device count */
270 };
271 
272 struct dmar_atsr_unit {
273 	struct list_head list;		/* list of ATSR units */
274 	struct acpi_dmar_header *hdr;	/* ACPI header */
275 	struct dmar_dev_scope *devices;	/* target devices */
276 	int devices_cnt;		/* target device count */
277 	u8 include_all:1;		/* include all ports */
278 };
279 
280 struct dmar_satc_unit {
281 	struct list_head list;		/* list of SATC units */
282 	struct acpi_dmar_header *hdr;	/* ACPI header */
283 	struct dmar_dev_scope *devices;	/* target devices */
284 	struct intel_iommu *iommu;	/* the corresponding iommu */
285 	int devices_cnt;		/* target device count */
286 	u8 atc_required:1;		/* ATS is required */
287 };
288 
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292 
293 #define for_each_rmrr_units(rmrr) \
294 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295 
296 /* number of IOMMUs in the system; used to size and index g_iommus */
297 static int g_num_of_iommus;
298 
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302 
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305 
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308 
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313 
314 #define IDENTMAP_GFX		2
315 #define IDENTMAP_AZALIA		4
316 
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319 
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322 
323 /*
324  * Iterate over elements in device_domain_list and call the specified
325  * callback @fn against each element.
326  */
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328 				     void *data), void *data)
329 {
330 	int ret = 0;
331 	unsigned long flags;
332 	struct device_domain_info *info;
333 
334 	spin_lock_irqsave(&device_domain_lock, flags);
335 	list_for_each_entry(info, &device_domain_list, global) {
336 		ret = fn(info, data);
337 		if (ret) {
338 			spin_unlock_irqrestore(&device_domain_lock, flags);
339 			return ret;
340 		}
341 	}
342 	spin_unlock_irqrestore(&device_domain_lock, flags);
343 
344 	return 0;
345 }
346 
347 const struct iommu_ops intel_iommu_ops;
348 
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
350 {
351 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
352 }
353 
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
355 {
356 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
357 }
358 
359 static void init_translation_status(struct intel_iommu *iommu)
360 {
361 	u32 gsts;
362 
363 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
364 	if (gsts & DMA_GSTS_TES)
365 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
366 }
367 
368 static int __init intel_iommu_setup(char *str)
369 {
370 	if (!str)
371 		return -EINVAL;
372 
373 	while (*str) {
374 		if (!strncmp(str, "on", 2)) {
375 			dmar_disabled = 0;
376 			pr_info("IOMMU enabled\n");
377 		} else if (!strncmp(str, "off", 3)) {
378 			dmar_disabled = 1;
379 			no_platform_optin = 1;
380 			pr_info("IOMMU disabled\n");
381 		} else if (!strncmp(str, "igfx_off", 8)) {
382 			dmar_map_gfx = 0;
383 			pr_info("Disable GFX device mapping\n");
384 		} else if (!strncmp(str, "forcedac", 8)) {
385 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386 			iommu_dma_forcedac = true;
387 		} else if (!strncmp(str, "strict", 6)) {
388 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389 			iommu_set_dma_strict();
390 		} else if (!strncmp(str, "sp_off", 6)) {
391 			pr_info("Disable supported super page\n");
392 			intel_iommu_superpage = 0;
393 		} else if (!strncmp(str, "sm_on", 5)) {
394 			pr_info("Enable scalable mode if hardware supports\n");
395 			intel_iommu_sm = 1;
396 		} else if (!strncmp(str, "sm_off", 6)) {
397 			pr_info("Scalable mode is disallowed\n");
398 			intel_iommu_sm = 0;
399 		} else if (!strncmp(str, "tboot_noforce", 13)) {
400 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401 			intel_iommu_tboot_noforce = 1;
402 		} else {
403 			pr_notice("Unknown option - '%s'\n", str);
404 		}
405 
406 		str += strcspn(str, ",");
407 		while (*str == ',')
408 			str++;
409 	}
410 
411 	return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
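/*
 * Example kernel command line usage of the options parsed above
 * (comma-separated values for the intel_iommu= parameter):
 *
 *     intel_iommu=on,sm_on
 *     intel_iommu=off
 *     intel_iommu=on,igfx_off,tboot_noforce
 *
 * Unrecognised options are reported with pr_notice() and skipped.
 */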
414 
415 void *alloc_pgtable_page(int node)
416 {
417 	struct page *page;
418 	void *vaddr = NULL;
419 
420 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421 	if (page)
422 		vaddr = page_address(page);
423 	return vaddr;
424 }
425 
426 void free_pgtable_page(void *vaddr)
427 {
428 	free_page((unsigned long)vaddr);
429 }
430 
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435 
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440 
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442 				       unsigned long pfn)
443 {
444 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445 
446 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448 
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451 	unsigned long sagaw;
452 	int agaw;
453 
454 	sagaw = cap_sagaw(iommu->cap);
455 	for (agaw = width_to_agaw(max_gaw);
456 	     agaw >= 0; agaw--) {
457 		if (test_bit(agaw, &sagaw))
458 			break;
459 	}
460 
461 	return agaw;
462 }
463 
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471 
472 /*
473  * Calculate agaw for each iommu.
474  * "SAGAW" may be different across iommus; use a default agaw, and
475  * fall back to a smaller supported agaw for iommus that don't support it.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
481 
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485 	int iommu_id;
486 
487 	/* si_domain and vm domain should not get here. */
488 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489 		return NULL;
490 
491 	for_each_domain_iommu(iommu_id, domain)
492 		break;
493 
494 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495 		return NULL;
496 
497 	return g_iommus[iommu_id];
498 }
499 
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502 	return sm_supported(iommu) ?
503 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
504 }
505 
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
507 {
508 	struct dmar_drhd_unit *drhd;
509 	struct intel_iommu *iommu;
510 	bool found = false;
511 	int i;
512 
513 	domain->iommu_coherency = true;
514 
515 	for_each_domain_iommu(i, domain) {
516 		found = true;
517 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
518 			domain->iommu_coherency = false;
519 			break;
520 		}
521 	}
522 	if (found)
523 		return;
524 
525 	/* No hardware attached; use lowest common denominator */
526 	rcu_read_lock();
527 	for_each_active_iommu(iommu, drhd) {
528 		if (!iommu_paging_structure_coherency(iommu)) {
529 			domain->iommu_coherency = false;
530 			break;
531 		}
532 	}
533 	rcu_read_unlock();
534 }
535 
536 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
537 {
538 	struct dmar_drhd_unit *drhd;
539 	struct intel_iommu *iommu;
540 	bool ret = true;
541 
542 	rcu_read_lock();
543 	for_each_active_iommu(iommu, drhd) {
544 		if (iommu != skip) {
545 			/*
546 			 * If the hardware is operating in the scalable mode,
547 			 * the snooping control is always supported since we
548 			 * always set PASID-table-entry.PGSNP bit if the domain
549 			 * is managed outside (UNMANAGED).
550 			 */
551 			if (!sm_supported(iommu) &&
552 			    !ecap_sc_support(iommu->ecap)) {
553 				ret = false;
554 				break;
555 			}
556 		}
557 	}
558 	rcu_read_unlock();
559 
560 	return ret;
561 }
562 
563 static int domain_update_iommu_superpage(struct dmar_domain *domain,
564 					 struct intel_iommu *skip)
565 {
566 	struct dmar_drhd_unit *drhd;
567 	struct intel_iommu *iommu;
568 	int mask = 0x3;
569 
570 	if (!intel_iommu_superpage)
571 		return 0;
572 
573 	/* set iommu_superpage to the smallest common denominator */
574 	rcu_read_lock();
575 	for_each_active_iommu(iommu, drhd) {
576 		if (iommu != skip) {
577 			if (domain && domain_use_first_level(domain)) {
578 				if (!cap_fl1gp_support(iommu->cap))
579 					mask = 0x1;
580 			} else {
581 				mask &= cap_super_page_val(iommu->cap);
582 			}
583 
584 			if (!mask)
585 				break;
586 		}
587 	}
588 	rcu_read_unlock();
589 
590 	return fls(mask);
591 }
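/*
 * Example of the mask/fls() encoding above: cap_super_page_val() yields a
 * bitmap where bit 0 means 2MiB superpages and bit 1 means 1GiB superpages.
 * If every iommu in the domain reports 0x3, fls(0x3) == 2, so
 * iommu_superpage becomes 2 (both 2MiB and 1GiB usable, see
 * domain_super_pgsize_bitmap() below); if one iommu only reports 0x1, the
 * intersection is 0x1 and fls(0x1) == 1 (2MiB only); if any iommu reports
 * 0, superpages are disabled for the domain.
 */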
592 
593 static int domain_update_device_node(struct dmar_domain *domain)
594 {
595 	struct device_domain_info *info;
596 	int nid = NUMA_NO_NODE;
597 
598 	assert_spin_locked(&device_domain_lock);
599 
600 	if (list_empty(&domain->devices))
601 		return NUMA_NO_NODE;
602 
603 	list_for_each_entry(info, &domain->devices, link) {
604 		if (!info->dev)
605 			continue;
606 
607 		/*
608 		 * There could possibly be multiple device numa nodes as devices
609 		 * within the same domain may sit behind different IOMMUs. There
610 		 * is no perfect answer in such a situation, so we pick the node
611 		 * of the first device on a first-come, first-served basis.
612 		 */
613 		nid = dev_to_node(info->dev);
614 		if (nid != NUMA_NO_NODE)
615 			break;
616 	}
617 
618 	return nid;
619 }
620 
621 static void domain_update_iotlb(struct dmar_domain *domain);
622 
623 /* Return the super pagesize bitmap if supported. */
624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
625 {
626 	unsigned long bitmap = 0;
627 
628 	/*
629 	 * 1-level super page supports page size of 2MiB, 2-level super page
630 	 * supports page size of both 2MiB and 1GiB.
631 	 */
632 	if (domain->iommu_superpage == 1)
633 		bitmap |= SZ_2M;
634 	else if (domain->iommu_superpage == 2)
635 		bitmap |= SZ_2M | SZ_1G;
636 
637 	return bitmap;
638 }
639 
640 /* Some capabilities may be different across iommus */
641 static void domain_update_iommu_cap(struct dmar_domain *domain)
642 {
643 	domain_update_iommu_coherency(domain);
644 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
645 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
646 
647 	/*
648 	 * If RHSA is missing, we should default to the device numa node
649 	 * as a fallback.
650 	 */
651 	if (domain->nid == NUMA_NO_NODE)
652 		domain->nid = domain_update_device_node(domain);
653 
654 	/*
655 	 * First-level translation restricts the input-address to a
656 	 * canonical address (i.e., address bits 63:N have the same
657 	 * value as address bit [N-1], where N is 48-bits with 4-level
658 	 * paging and 57-bits with 5-level paging). Hence, skip bit
659 	 * [N-1].
660 	 */
661 	if (domain_use_first_level(domain))
662 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
663 	else
664 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
665 
666 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
667 	domain_update_iotlb(domain);
668 }
669 
670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
671 					 u8 devfn, int alloc)
672 {
673 	struct root_entry *root = &iommu->root_entry[bus];
674 	struct context_entry *context;
675 	u64 *entry;
676 
677 	entry = &root->lo;
678 	if (sm_supported(iommu)) {
679 		if (devfn >= 0x80) {
680 			devfn -= 0x80;
681 			entry = &root->hi;
682 		}
683 		devfn *= 2;
684 	}
685 	if (*entry & 1)
686 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
687 	else {
688 		unsigned long phy_addr;
689 		if (!alloc)
690 			return NULL;
691 
692 		context = alloc_pgtable_page(iommu->node);
693 		if (!context)
694 			return NULL;
695 
696 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
697 		phy_addr = virt_to_phys((void *)context);
698 		*entry = phy_addr | 1;
699 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
700 	}
701 	return &context[devfn];
702 }
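/*
 * Indexing sketch for the scalable-mode branch above: the root table still
 * has one root entry per bus, but root->lo covers devfn 0x00-0x7f and
 * root->hi covers devfn 0x80-0xff.  For example, devfn 0x85 selects
 * root->hi, is rebased to 0x05 and then doubled to 0x0a, because each
 * device's 256-bit scalable-mode context entry spans two 128-bit
 * struct context_entry slots in the context table.
 */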
703 
704 /**
705  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
706  *				 sub-hierarchy of a candidate PCI-PCI bridge
707  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
708  * @bridge: the candidate PCI-PCI bridge
709  *
710  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
711  */
712 static bool
713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
714 {
715 	struct pci_dev *pdev, *pbridge;
716 
717 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
718 		return false;
719 
720 	pdev = to_pci_dev(dev);
721 	pbridge = to_pci_dev(bridge);
722 
723 	if (pbridge->subordinate &&
724 	    pbridge->subordinate->number <= pdev->bus->number &&
725 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
726 		return true;
727 
728 	return false;
729 }
730 
731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
732 {
733 	struct dmar_drhd_unit *drhd;
734 	u32 vtbar;
735 	int rc;
736 
737 	/* We know that this device on this chipset has its own IOMMU.
738 	 * If we find it under a different IOMMU, then the BIOS is lying
739 	 * to us. Hope that the IOMMU for this device is actually
740 	 * disabled, and it needs no translation...
741 	 */
742 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
743 	if (rc) {
744 		/* "can't" happen */
745 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
746 		return false;
747 	}
748 	vtbar &= 0xffff0000;
749 
750 	/* we know that this iommu should be at offset 0xa000 from vtbar */
751 	drhd = dmar_find_matched_drhd_unit(pdev);
752 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
753 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
754 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
755 		return true;
756 	}
757 
758 	return false;
759 }
760 
761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
762 {
763 	if (!iommu || iommu->drhd->ignored)
764 		return true;
765 
766 	if (dev_is_pci(dev)) {
767 		struct pci_dev *pdev = to_pci_dev(dev);
768 
769 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
770 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
771 		    quirk_ioat_snb_local_iommu(pdev))
772 			return true;
773 	}
774 
775 	return false;
776 }
777 
778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
779 {
780 	struct dmar_drhd_unit *drhd = NULL;
781 	struct pci_dev *pdev = NULL;
782 	struct intel_iommu *iommu;
783 	struct device *tmp;
784 	u16 segment = 0;
785 	int i;
786 
787 	if (!dev)
788 		return NULL;
789 
790 	if (dev_is_pci(dev)) {
791 		struct pci_dev *pf_pdev;
792 
793 		pdev = pci_real_dma_dev(to_pci_dev(dev));
794 
795 		/* VFs aren't listed in scope tables; we need to look up
796 		 * the PF instead to find the IOMMU. */
797 		pf_pdev = pci_physfn(pdev);
798 		dev = &pf_pdev->dev;
799 		segment = pci_domain_nr(pdev->bus);
800 	} else if (has_acpi_companion(dev))
801 		dev = &ACPI_COMPANION(dev)->dev;
802 
803 	rcu_read_lock();
804 	for_each_iommu(iommu, drhd) {
805 		if (pdev && segment != drhd->segment)
806 			continue;
807 
808 		for_each_active_dev_scope(drhd->devices,
809 					  drhd->devices_cnt, i, tmp) {
810 			if (tmp == dev) {
811 				/* For a VF use its original BDF# not that of the PF
812 				 * which we used for the IOMMU lookup. Strictly speaking
813 				 * we could do this for all PCI devices; we only need to
814 				 * get the BDF# from the scope table for ACPI matches. */
815 				if (pdev && pdev->is_virtfn)
816 					goto got_pdev;
817 
818 				if (bus && devfn) {
819 					*bus = drhd->devices[i].bus;
820 					*devfn = drhd->devices[i].devfn;
821 				}
822 				goto out;
823 			}
824 
825 			if (is_downstream_to_pci_bridge(dev, tmp))
826 				goto got_pdev;
827 		}
828 
829 		if (pdev && drhd->include_all) {
830 got_pdev:
831 			if (bus && devfn) {
832 				*bus = pdev->bus->number;
833 				*devfn = pdev->devfn;
834 			}
835 			goto out;
836 		}
837 	}
838 	iommu = NULL;
839 out:
840 	if (iommu_is_dummy(iommu, dev))
841 		iommu = NULL;
842 
843 	rcu_read_unlock();
844 
845 	return iommu;
846 }
847 
848 static void domain_flush_cache(struct dmar_domain *domain,
849 			       void *addr, int size)
850 {
851 	if (!domain->iommu_coherency)
852 		clflush_cache_range(addr, size);
853 }
854 
855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
856 {
857 	struct context_entry *context;
858 	int ret = 0;
859 	unsigned long flags;
860 
861 	spin_lock_irqsave(&iommu->lock, flags);
862 	context = iommu_context_addr(iommu, bus, devfn, 0);
863 	if (context)
864 		ret = context_present(context);
865 	spin_unlock_irqrestore(&iommu->lock, flags);
866 	return ret;
867 }
868 
869 static void free_context_table(struct intel_iommu *iommu)
870 {
871 	int i;
872 	unsigned long flags;
873 	struct context_entry *context;
874 
875 	spin_lock_irqsave(&iommu->lock, flags);
876 	if (!iommu->root_entry) {
877 		goto out;
878 	}
879 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
880 		context = iommu_context_addr(iommu, i, 0, 0);
881 		if (context)
882 			free_pgtable_page(context);
883 
884 		if (!sm_supported(iommu))
885 			continue;
886 
887 		context = iommu_context_addr(iommu, i, 0x80, 0);
888 		if (context)
889 			free_pgtable_page(context);
890 
891 	}
892 	free_pgtable_page(iommu->root_entry);
893 	iommu->root_entry = NULL;
894 out:
895 	spin_unlock_irqrestore(&iommu->lock, flags);
896 }
897 
898 #ifdef CONFIG_DMAR_DEBUG
899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
900 {
901 	struct device_domain_info *info;
902 	struct dma_pte *parent, *pte;
903 	struct dmar_domain *domain;
904 	int offset, level;
905 
906 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
907 	if (!info || !info->domain) {
908 		pr_info("device [%02x:%02x.%d] not probed\n",
909 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
910 		return;
911 	}
912 
913 	domain = info->domain;
914 	level = agaw_to_level(domain->agaw);
915 	parent = domain->pgd;
916 	if (!parent) {
917 		pr_info("no page table setup\n");
918 		return;
919 	}
920 
921 	while (1) {
922 		offset = pfn_level_offset(pfn, level);
923 		pte = &parent[offset];
924 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
925 			pr_info("PTE not present at level %d\n", level);
926 			break;
927 		}
928 
929 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
930 
931 		if (level == 1)
932 			break;
933 
934 		parent = phys_to_virt(dma_pte_addr(pte));
935 		level--;
936 	}
937 }
938 
939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
940 			  unsigned long long addr, u32 pasid)
941 {
942 	struct pasid_dir_entry *dir, *pde;
943 	struct pasid_entry *entries, *pte;
944 	struct context_entry *ctx_entry;
945 	struct root_entry *rt_entry;
946 	u8 devfn = source_id & 0xff;
947 	u8 bus = source_id >> 8;
948 	int i, dir_index, index;
949 
950 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
951 
952 	/* root entry dump */
953 	rt_entry = &iommu->root_entry[bus];
954 	if (!rt_entry) {
955 		pr_info("root table entry is not present\n");
956 		return;
957 	}
958 
959 	if (sm_supported(iommu))
960 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
961 			rt_entry->hi, rt_entry->lo);
962 	else
963 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
964 
965 	/* context entry dump */
966 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
967 	if (!ctx_entry) {
968 		pr_info("context table entry is not present\n");
969 		return;
970 	}
971 
972 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
973 		ctx_entry->hi, ctx_entry->lo);
974 
975 	/* legacy mode does not require PASID entries */
976 	if (!sm_supported(iommu))
977 		goto pgtable_walk;
978 
979 	/* get the pointer to pasid directory entry */
980 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
981 	if (!dir) {
982 		pr_info("pasid directory entry is not present\n");
983 		return;
984 	}
985 	/* For request-without-pasid, get the pasid from context entry */
986 	if (intel_iommu_sm && pasid == INVALID_IOASID)
987 		pasid = PASID_RID2PASID;
988 
989 	dir_index = pasid >> PASID_PDE_SHIFT;
990 	pde = &dir[dir_index];
991 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
992 
993 	/* get the pointer to the pasid table entry */
994 	entries = get_pasid_table_from_pde(pde);
995 	if (!entries) {
996 		pr_info("pasid table entry is not present\n");
997 		return;
998 	}
999 	index = pasid & PASID_PTE_MASK;
1000 	pte = &entries[index];
1001 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1002 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1003 
1004 pgtable_walk:
1005 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1006 }
1007 #endif
1008 
1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1010 				      unsigned long pfn, int *target_level)
1011 {
1012 	struct dma_pte *parent, *pte;
1013 	int level = agaw_to_level(domain->agaw);
1014 	int offset;
1015 
1016 	BUG_ON(!domain->pgd);
1017 
1018 	if (!domain_pfn_supported(domain, pfn))
1019 		/* Address beyond IOMMU's addressing capabilities. */
1020 		return NULL;
1021 
1022 	parent = domain->pgd;
1023 
1024 	while (1) {
1025 		void *tmp_page;
1026 
1027 		offset = pfn_level_offset(pfn, level);
1028 		pte = &parent[offset];
1029 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1030 			break;
1031 		if (level == *target_level)
1032 			break;
1033 
1034 		if (!dma_pte_present(pte)) {
1035 			uint64_t pteval;
1036 
1037 			tmp_page = alloc_pgtable_page(domain->nid);
1038 
1039 			if (!tmp_page)
1040 				return NULL;
1041 
1042 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1043 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1044 			if (domain_use_first_level(domain)) {
1045 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1046 				if (iommu_is_dma_domain(&domain->domain))
1047 					pteval |= DMA_FL_PTE_ACCESS;
1048 			}
1049 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1050 				/* Someone else set it while we were thinking; use theirs. */
1051 				free_pgtable_page(tmp_page);
1052 			else
1053 				domain_flush_cache(domain, pte, sizeof(*pte));
1054 		}
1055 		if (level == 1)
1056 			break;
1057 
1058 		parent = phys_to_virt(dma_pte_addr(pte));
1059 		level--;
1060 	}
1061 
1062 	if (!*target_level)
1063 		*target_level = level;
1064 
1065 	return pte;
1066 }
1067 
1068 /* return address's pte at specific level */
1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1070 					 unsigned long pfn,
1071 					 int level, int *large_page)
1072 {
1073 	struct dma_pte *parent, *pte;
1074 	int total = agaw_to_level(domain->agaw);
1075 	int offset;
1076 
1077 	parent = domain->pgd;
1078 	while (level <= total) {
1079 		offset = pfn_level_offset(pfn, total);
1080 		pte = &parent[offset];
1081 		if (level == total)
1082 			return pte;
1083 
1084 		if (!dma_pte_present(pte)) {
1085 			*large_page = total;
1086 			break;
1087 		}
1088 
1089 		if (dma_pte_superpage(pte)) {
1090 			*large_page = total;
1091 			return pte;
1092 		}
1093 
1094 		parent = phys_to_virt(dma_pte_addr(pte));
1095 		total--;
1096 	}
1097 	return NULL;
1098 }
1099 
1100 /* clear last level pte, a tlb flush should follow */
1101 static void dma_pte_clear_range(struct dmar_domain *domain,
1102 				unsigned long start_pfn,
1103 				unsigned long last_pfn)
1104 {
1105 	unsigned int large_page;
1106 	struct dma_pte *first_pte, *pte;
1107 
1108 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1109 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1110 	BUG_ON(start_pfn > last_pfn);
1111 
1112 	/* we don't need lock here; nobody else touches the iova range */
1113 	do {
1114 		large_page = 1;
1115 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1116 		if (!pte) {
1117 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1118 			continue;
1119 		}
1120 		do {
1121 			dma_clear_pte(pte);
1122 			start_pfn += lvl_to_nr_pages(large_page);
1123 			pte++;
1124 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1125 
1126 		domain_flush_cache(domain, first_pte,
1127 				   (void *)pte - (void *)first_pte);
1128 
1129 	} while (start_pfn && start_pfn <= last_pfn);
1130 }
1131 
1132 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1133 			       int retain_level, struct dma_pte *pte,
1134 			       unsigned long pfn, unsigned long start_pfn,
1135 			       unsigned long last_pfn)
1136 {
1137 	pfn = max(start_pfn, pfn);
1138 	pte = &pte[pfn_level_offset(pfn, level)];
1139 
1140 	do {
1141 		unsigned long level_pfn;
1142 		struct dma_pte *level_pte;
1143 
1144 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1145 			goto next;
1146 
1147 		level_pfn = pfn & level_mask(level);
1148 		level_pte = phys_to_virt(dma_pte_addr(pte));
1149 
1150 		if (level > 2) {
1151 			dma_pte_free_level(domain, level - 1, retain_level,
1152 					   level_pte, level_pfn, start_pfn,
1153 					   last_pfn);
1154 		}
1155 
1156 		/*
1157 		 * Free the page table if we're below the level we want to
1158 		 * retain and the range covers the entire table.
1159 		 */
1160 		if (level < retain_level && !(start_pfn > level_pfn ||
1161 		      last_pfn < level_pfn + level_size(level) - 1)) {
1162 			dma_clear_pte(pte);
1163 			domain_flush_cache(domain, pte, sizeof(*pte));
1164 			free_pgtable_page(level_pte);
1165 		}
1166 next:
1167 		pfn += level_size(level);
1168 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1169 }
1170 
1171 /*
1172  * clear last level (leaf) ptes and free page table pages below the
1173  * level we wish to keep intact.
1174  */
1175 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1176 				   unsigned long start_pfn,
1177 				   unsigned long last_pfn,
1178 				   int retain_level)
1179 {
1180 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1181 
1182 	/* We don't need lock here; nobody else touches the iova range */
1183 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1184 			   domain->pgd, 0, start_pfn, last_pfn);
1185 
1186 	/* free pgd */
1187 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188 		free_pgtable_page(domain->pgd);
1189 		domain->pgd = NULL;
1190 	}
1191 }
1192 
1193 /* When a page at a given level is being unlinked from its parent, we don't
1194    need to *modify* it at all. All we need to do is make a list of all the
1195    pages which can be freed just as soon as we've flushed the IOTLB and we
1196    know the hardware page-walk will no longer touch them.
1197    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1198    be freed. */
1199 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1200 				    int level, struct dma_pte *pte,
1201 				    struct list_head *freelist)
1202 {
1203 	struct page *pg;
1204 
1205 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1206 	list_add_tail(&pg->lru, freelist);
1207 
1208 	if (level == 1)
1209 		return;
1210 
1211 	pte = page_address(pg);
1212 	do {
1213 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1214 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1215 		pte++;
1216 	} while (!first_pte_in_page(pte));
1217 }
1218 
1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 				struct dma_pte *pte, unsigned long pfn,
1221 				unsigned long start_pfn, unsigned long last_pfn,
1222 				struct list_head *freelist)
1223 {
1224 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1225 
1226 	pfn = max(start_pfn, pfn);
1227 	pte = &pte[pfn_level_offset(pfn, level)];
1228 
1229 	do {
1230 		unsigned long level_pfn = pfn & level_mask(level);
1231 
1232 		if (!dma_pte_present(pte))
1233 			goto next;
1234 
1235 		/* If range covers entire pagetable, free it */
1236 		if (start_pfn <= level_pfn &&
1237 		    last_pfn >= level_pfn + level_size(level) - 1) {
1238 			/* These subordinate page tables are going away entirely. Don't
1239 			   bother to clear them; we're just going to *free* them. */
1240 			if (level > 1 && !dma_pte_superpage(pte))
1241 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1242 
1243 			dma_clear_pte(pte);
1244 			if (!first_pte)
1245 				first_pte = pte;
1246 			last_pte = pte;
1247 		} else if (level > 1) {
1248 			/* Recurse down into a level that isn't *entirely* obsolete */
1249 			dma_pte_clear_level(domain, level - 1,
1250 					    phys_to_virt(dma_pte_addr(pte)),
1251 					    level_pfn, start_pfn, last_pfn,
1252 					    freelist);
1253 		}
1254 next:
1255 		pfn = level_pfn + level_size(level);
1256 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1257 
1258 	if (first_pte)
1259 		domain_flush_cache(domain, first_pte,
1260 				   (void *)++last_pte - (void *)first_pte);
1261 }
1262 
1263 /* We can't just free the pages because the IOMMU may still be walking
1264    the page tables, and may have cached the intermediate levels. The
1265    pages can only be freed after the IOTLB flush has been done. */
1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1267 			 unsigned long last_pfn, struct list_head *freelist)
1268 {
1269 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1270 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1271 	BUG_ON(start_pfn > last_pfn);
1272 
1273 	/* we don't need lock here; nobody else touches the iova range */
1274 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1275 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1276 
1277 	/* free pgd */
1278 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1279 		struct page *pgd_page = virt_to_page(domain->pgd);
1280 		list_add_tail(&pgd_page->lru, freelist);
1281 		domain->pgd = NULL;
1282 	}
1283 }
1284 
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1287 {
1288 	struct root_entry *root;
1289 	unsigned long flags;
1290 
1291 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1292 	if (!root) {
1293 		pr_err("Allocating root entry for %s failed\n",
1294 			iommu->name);
1295 		return -ENOMEM;
1296 	}
1297 
1298 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1299 
1300 	spin_lock_irqsave(&iommu->lock, flags);
1301 	iommu->root_entry = root;
1302 	spin_unlock_irqrestore(&iommu->lock, flags);
1303 
1304 	return 0;
1305 }
1306 
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1308 {
1309 	u64 addr;
1310 	u32 sts;
1311 	unsigned long flag;
1312 
1313 	addr = virt_to_phys(iommu->root_entry);
1314 	if (sm_supported(iommu))
1315 		addr |= DMA_RTADDR_SMT;
1316 
1317 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1319 
1320 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1321 
1322 	/* Make sure hardware complete it */
1323 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324 		      readl, (sts & DMA_GSTS_RTPS), sts);
1325 
1326 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 
1328 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1329 	if (sm_supported(iommu))
1330 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1331 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1332 }
1333 
1334 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1335 {
1336 	u32 val;
1337 	unsigned long flag;
1338 
1339 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1340 		return;
1341 
1342 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1343 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1344 
1345 	/* Make sure hardware complete it */
1346 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1347 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1348 
1349 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1350 }
1351 
1352 /* return value determines if we need a write buffer flush */
1353 static void __iommu_flush_context(struct intel_iommu *iommu,
1354 				  u16 did, u16 source_id, u8 function_mask,
1355 				  u64 type)
1356 {
1357 	u64 val = 0;
1358 	unsigned long flag;
1359 
1360 	switch (type) {
1361 	case DMA_CCMD_GLOBAL_INVL:
1362 		val = DMA_CCMD_GLOBAL_INVL;
1363 		break;
1364 	case DMA_CCMD_DOMAIN_INVL:
1365 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1366 		break;
1367 	case DMA_CCMD_DEVICE_INVL:
1368 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1369 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1370 		break;
1371 	default:
1372 		BUG();
1373 	}
1374 	val |= DMA_CCMD_ICC;
1375 
1376 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1377 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1378 
1379 	/* Make sure hardware complete it */
1380 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1381 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1382 
1383 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1384 }
1385 
1386 /* return value determines if we need a write buffer flush */
1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1388 				u64 addr, unsigned int size_order, u64 type)
1389 {
1390 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1391 	u64 val = 0, val_iva = 0;
1392 	unsigned long flag;
1393 
1394 	switch (type) {
1395 	case DMA_TLB_GLOBAL_FLUSH:
1396 		/* global flush doesn't need set IVA_REG */
1397 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1398 		break;
1399 	case DMA_TLB_DSI_FLUSH:
1400 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401 		break;
1402 	case DMA_TLB_PSI_FLUSH:
1403 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1404 		/* IH bit is passed in as part of address */
1405 		val_iva = size_order | addr;
1406 		break;
1407 	default:
1408 		BUG();
1409 	}
1410 	/* Note: set drain read/write */
1411 #if 0
1412 	/*
1413 	 * This is probably only there to be extra secure. Looks like we can
1414 	 * ignore it without any impact.
1415 	 */
1416 	if (cap_read_drain(iommu->cap))
1417 		val |= DMA_TLB_READ_DRAIN;
1418 #endif
1419 	if (cap_write_drain(iommu->cap))
1420 		val |= DMA_TLB_WRITE_DRAIN;
1421 
1422 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 	/* Note: Only uses first TLB reg currently */
1424 	if (val_iva)
1425 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1426 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1427 
1428 	/* Make sure hardware complete it */
1429 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1430 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1431 
1432 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1433 
1434 	/* check IOTLB invalidation granularity */
1435 	if (DMA_TLB_IAIG(val) == 0)
1436 		pr_err("Flush IOTLB failed\n");
1437 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1438 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1439 			(unsigned long long)DMA_TLB_IIRG(type),
1440 			(unsigned long long)DMA_TLB_IAIG(val));
1441 }
1442 
1443 static struct device_domain_info *
1444 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1445 			 u8 bus, u8 devfn)
1446 {
1447 	struct device_domain_info *info;
1448 
1449 	assert_spin_locked(&device_domain_lock);
1450 
1451 	if (!iommu->qi)
1452 		return NULL;
1453 
1454 	list_for_each_entry(info, &domain->devices, link)
1455 		if (info->iommu == iommu && info->bus == bus &&
1456 		    info->devfn == devfn) {
1457 			if (info->ats_supported && info->dev)
1458 				return info;
1459 			break;
1460 		}
1461 
1462 	return NULL;
1463 }
1464 
1465 static void domain_update_iotlb(struct dmar_domain *domain)
1466 {
1467 	struct device_domain_info *info;
1468 	bool has_iotlb_device = false;
1469 
1470 	assert_spin_locked(&device_domain_lock);
1471 
1472 	list_for_each_entry(info, &domain->devices, link)
1473 		if (info->ats_enabled) {
1474 			has_iotlb_device = true;
1475 			break;
1476 		}
1477 
1478 	domain->has_iotlb_device = has_iotlb_device;
1479 }
1480 
1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1482 {
1483 	struct pci_dev *pdev;
1484 
1485 	assert_spin_locked(&device_domain_lock);
1486 
1487 	if (!info || !dev_is_pci(info->dev))
1488 		return;
1489 
1490 	pdev = to_pci_dev(info->dev);
1491 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1492 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1493 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1494 	 * reserved, which should be set to 0.
1495 	 */
1496 	if (!ecap_dit(info->iommu->ecap))
1497 		info->pfsid = 0;
1498 	else {
1499 		struct pci_dev *pf_pdev;
1500 
1501 		/* pdev will be returned if device is not a vf */
1502 		pf_pdev = pci_physfn(pdev);
1503 		info->pfsid = pci_dev_id(pf_pdev);
1504 	}
1505 
1506 #ifdef CONFIG_INTEL_IOMMU_SVM
1507 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1508 	   the device if you enable PASID support after ATS support is
1509 	   undefined. So always enable PASID support on devices which
1510 	   have it, even if we can't yet know if we're ever going to
1511 	   use it. */
1512 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513 		info->pasid_enabled = 1;
1514 
1515 	if (info->pri_supported &&
1516 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1517 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1518 		info->pri_enabled = 1;
1519 #endif
1520 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1521 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1522 		info->ats_enabled = 1;
1523 		domain_update_iotlb(info->domain);
1524 		info->ats_qdep = pci_ats_queue_depth(pdev);
1525 	}
1526 }
1527 
1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1529 {
1530 	struct pci_dev *pdev;
1531 
1532 	assert_spin_locked(&device_domain_lock);
1533 
1534 	if (!dev_is_pci(info->dev))
1535 		return;
1536 
1537 	pdev = to_pci_dev(info->dev);
1538 
1539 	if (info->ats_enabled) {
1540 		pci_disable_ats(pdev);
1541 		info->ats_enabled = 0;
1542 		domain_update_iotlb(info->domain);
1543 	}
1544 #ifdef CONFIG_INTEL_IOMMU_SVM
1545 	if (info->pri_enabled) {
1546 		pci_disable_pri(pdev);
1547 		info->pri_enabled = 0;
1548 	}
1549 	if (info->pasid_enabled) {
1550 		pci_disable_pasid(pdev);
1551 		info->pasid_enabled = 0;
1552 	}
1553 #endif
1554 }
1555 
1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1557 				    u64 addr, unsigned int mask)
1558 {
1559 	u16 sid, qdep;
1560 
1561 	if (!info || !info->ats_enabled)
1562 		return;
1563 
1564 	sid = info->bus << 8 | info->devfn;
1565 	qdep = info->ats_qdep;
1566 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1567 			   qdep, addr, mask);
1568 }
1569 
1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1571 				  u64 addr, unsigned mask)
1572 {
1573 	unsigned long flags;
1574 	struct device_domain_info *info;
1575 
1576 	if (!domain->has_iotlb_device)
1577 		return;
1578 
1579 	spin_lock_irqsave(&device_domain_lock, flags);
1580 	list_for_each_entry(info, &domain->devices, link)
1581 		__iommu_flush_dev_iotlb(info, addr, mask);
1582 
1583 	spin_unlock_irqrestore(&device_domain_lock, flags);
1584 }
1585 
1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1587 				  struct dmar_domain *domain,
1588 				  unsigned long pfn, unsigned int pages,
1589 				  int ih, int map)
1590 {
1591 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1592 	unsigned int mask = ilog2(aligned_pages);
1593 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1594 	u16 did = domain->iommu_did[iommu->seq_id];
1595 
1596 	BUG_ON(pages == 0);
1597 
1598 	if (ih)
1599 		ih = 1 << 6;
1600 
1601 	if (domain_use_first_level(domain)) {
1602 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1603 	} else {
1604 		unsigned long bitmask = aligned_pages - 1;
1605 
1606 		/*
1607 		 * PSI masks the low order bits of the base address. If the
1608 		 * address isn't aligned to the mask, then compute a mask value
1609 		 * needed to ensure the target range is flushed.
1610 		 */
1611 		if (unlikely(bitmask & pfn)) {
1612 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1613 
1614 			/*
1615 			 * Since end_pfn <= pfn + bitmask, the only way bits
1616 			 * higher than bitmask can differ in pfn and end_pfn is
1617 			 * by carrying. This means after masking out bitmask,
1618 			 * high bits starting with the first set bit in
1619 			 * shared_bits are all equal in both pfn and end_pfn.
1620 			 */
1621 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1622 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1623 		}
1624 
1625 		/*
1626 		 * Fallback to domain selective flush if no PSI support or
1627 		 * the size is too big.
1628 		 */
1629 		if (!cap_pgsel_inv(iommu->cap) ||
1630 		    mask > cap_max_amask_val(iommu->cap))
1631 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1632 							DMA_TLB_DSI_FLUSH);
1633 		else
1634 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1635 							DMA_TLB_PSI_FLUSH);
1636 	}
1637 
1638 	/*
1639 	 * In caching mode, changes of pages from non-present to present require
1640 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1641 	 */
1642 	if (!cap_caching_mode(iommu->cap) || !map)
1643 		iommu_flush_dev_iotlb(domain, addr, mask);
1644 }
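/*
 * Worked example of the unaligned-PSI mask computation above: flushing
 * pages == 2 starting at pfn == 3 gives aligned_pages == 2, bitmask == 1
 * and end_pfn == 4.  pfn ^ end_pfn == 0x7, so
 * shared_bits == ~0x7 & ~0x1 == ...11111000 and __ffs(shared_bits) == 3.
 * A PSI with mask == 3 therefore invalidates pfns 0-7, which safely covers
 * the requested range 3-4 at the cost of some over-invalidation.
 */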
1645 
1646 /* Notification for newly created mappings */
1647 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1648 					struct dmar_domain *domain,
1649 					unsigned long pfn, unsigned int pages)
1650 {
1651 	/*
1652 	 * It's a non-present to present mapping. Only flush if caching mode
1653 	 * and second level.
1654 	 */
1655 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1656 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1657 	else
1658 		iommu_flush_write_buffer(iommu);
1659 }
1660 
1661 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1662 {
1663 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1664 	int idx;
1665 
1666 	for_each_domain_iommu(idx, dmar_domain) {
1667 		struct intel_iommu *iommu = g_iommus[idx];
1668 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1669 
1670 		if (domain_use_first_level(dmar_domain))
1671 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1672 		else
1673 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1674 						 DMA_TLB_DSI_FLUSH);
1675 
1676 		if (!cap_caching_mode(iommu->cap))
1677 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1678 	}
1679 }
1680 
1681 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1682 {
1683 	u32 pmen;
1684 	unsigned long flags;
1685 
1686 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1687 		return;
1688 
1689 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1690 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1691 	pmen &= ~DMA_PMEN_EPM;
1692 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1693 
1694 	/* wait for the protected region status bit to clear */
1695 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1696 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1697 
1698 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1699 }
1700 
1701 static void iommu_enable_translation(struct intel_iommu *iommu)
1702 {
1703 	u32 sts;
1704 	unsigned long flags;
1705 
1706 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1707 	iommu->gcmd |= DMA_GCMD_TE;
1708 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1709 
1710 	/* Make sure hardware complete it */
1711 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1712 		      readl, (sts & DMA_GSTS_TES), sts);
1713 
1714 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1715 }
1716 
1717 static void iommu_disable_translation(struct intel_iommu *iommu)
1718 {
1719 	u32 sts;
1720 	unsigned long flag;
1721 
1722 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1723 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1724 		return;
1725 
1726 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1727 	iommu->gcmd &= ~DMA_GCMD_TE;
1728 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1729 
1730 	/* Make sure hardware complete it */
1731 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1732 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1733 
1734 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1735 }
1736 
1737 static int iommu_init_domains(struct intel_iommu *iommu)
1738 {
1739 	u32 ndomains;
1740 
1741 	ndomains = cap_ndoms(iommu->cap);
1742 	pr_debug("%s: Number of Domains supported <%d>\n",
1743 		 iommu->name, ndomains);
1744 
1745 	spin_lock_init(&iommu->lock);
1746 
1747 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1748 	if (!iommu->domain_ids)
1749 		return -ENOMEM;
1750 
1751 	/*
1752 	 * If Caching mode is set, then invalid translations are tagged
1753 	 * with domain-id 0, hence we need to pre-allocate it. We also
1754 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1755 	 * make sure it is not used for a real domain.
1756 	 */
1757 	set_bit(0, iommu->domain_ids);
1758 
1759 	/*
1760 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1761 	 * entry for first-level or pass-through translation modes should
1762 	 * be programmed with a domain id different from those used for
1763 	 * second-level or nested translation. We reserve a domain id for
1764 	 * this purpose.
1765 	 */
1766 	if (sm_supported(iommu))
1767 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1768 
1769 	return 0;
1770 }
1771 
1772 static void disable_dmar_iommu(struct intel_iommu *iommu)
1773 {
1774 	struct device_domain_info *info, *tmp;
1775 	unsigned long flags;
1776 
1777 	if (!iommu->domain_ids)
1778 		return;
1779 
1780 	spin_lock_irqsave(&device_domain_lock, flags);
1781 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1782 		if (info->iommu != iommu)
1783 			continue;
1784 
1785 		if (!info->dev || !info->domain)
1786 			continue;
1787 
1788 		__dmar_remove_one_dev_info(info);
1789 	}
1790 	spin_unlock_irqrestore(&device_domain_lock, flags);
1791 
1792 	if (iommu->gcmd & DMA_GCMD_TE)
1793 		iommu_disable_translation(iommu);
1794 }
1795 
1796 static void free_dmar_iommu(struct intel_iommu *iommu)
1797 {
1798 	if (iommu->domain_ids) {
1799 		bitmap_free(iommu->domain_ids);
1800 		iommu->domain_ids = NULL;
1801 	}
1802 
1803 	g_iommus[iommu->seq_id] = NULL;
1804 
1805 	/* free context mapping */
1806 	free_context_table(iommu);
1807 
1808 #ifdef CONFIG_INTEL_IOMMU_SVM
1809 	if (pasid_supported(iommu)) {
1810 		if (ecap_prs(iommu->ecap))
1811 			intel_svm_finish_prq(iommu);
1812 	}
1813 	if (vccap_pasid(iommu->vccap))
1814 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1815 
1816 #endif
1817 }
1818 
1819 /*
1820  * Check and return whether first level is used by default for
1821  * DMA translation.
1822  */
1823 static bool first_level_by_default(unsigned int type)
1824 {
1825 	/* Only SL is available in legacy mode */
1826 	if (!scalable_mode_support())
1827 		return false;
1828 
1829 	/* Only one level (either FL or SL) is available, just use it */
1830 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1831 		return intel_cap_flts_sanity();
1832 
1833 	/* Both levels are available, decide it based on domain type */
1834 	return type != IOMMU_DOMAIN_UNMANAGED;
1835 }
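/*
 * Summary of the first_level_by_default() decision above:
 *
 *   scalable mode off                   -> second level (only choice)
 *   only FL or only SL supported        -> use whichever is supported
 *   both supported, DMA/identity domain -> first level
 *   both supported, UNMANAGED domain    -> second level
 */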
1836 
1837 static struct dmar_domain *alloc_domain(unsigned int type)
1838 {
1839 	struct dmar_domain *domain;
1840 
1841 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1842 	if (!domain)
1843 		return NULL;
1844 
1845 	domain->nid = NUMA_NO_NODE;
1846 	if (first_level_by_default(type))
1847 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1848 	domain->has_iotlb_device = false;
1849 	INIT_LIST_HEAD(&domain->devices);
1850 
1851 	return domain;
1852 }
1853 
1854 /* Must be called with iommu->lock */
1855 static int domain_attach_iommu(struct dmar_domain *domain,
1856 			       struct intel_iommu *iommu)
1857 {
1858 	unsigned long ndomains;
1859 	int num;
1860 
1861 	assert_spin_locked(&device_domain_lock);
1862 	assert_spin_locked(&iommu->lock);
1863 
1864 	domain->iommu_refcnt[iommu->seq_id] += 1;
1865 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866 		ndomains = cap_ndoms(iommu->cap);
1867 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1868 
1869 		if (num >= ndomains) {
1870 			pr_err("%s: No free domain ids\n", iommu->name);
1871 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 			return -ENOSPC;
1873 		}
1874 
1875 		set_bit(num, iommu->domain_ids);
1876 		domain->iommu_did[iommu->seq_id] = num;
1877 		domain->nid			 = iommu->node;
1878 		domain_update_iommu_cap(domain);
1879 	}
1880 
1881 	return 0;
1882 }
1883 
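/* Must be called with device_domain_lock and iommu->lock held */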
1884 static void domain_detach_iommu(struct dmar_domain *domain,
1885 				struct intel_iommu *iommu)
1886 {
1887 	int num;
1888 
1889 	assert_spin_locked(&device_domain_lock);
1890 	assert_spin_locked(&iommu->lock);
1891 
1892 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1893 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1894 		num = domain->iommu_did[iommu->seq_id];
1895 		clear_bit(num, iommu->domain_ids);
1896 		domain_update_iommu_cap(domain);
1897 		domain->iommu_did[iommu->seq_id] = 0;
1898 	}
1899 }
1900 
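/*
 * Round the guest address width up to the next width that maps to a whole
 * number of page-table levels (12 bits of page offset plus a multiple of
 * 9-bit strides), capped at 64 bits.
 */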
1901 static inline int guestwidth_to_adjustwidth(int gaw)
1902 {
1903 	int agaw;
1904 	int r = (gaw - 12) % 9;
1905 
1906 	if (r == 0)
1907 		agaw = gaw;
1908 	else
1909 		agaw = gaw + 9 - r;
1910 	if (agaw > 64)
1911 		agaw = 64;
1912 	return agaw;
1913 }
1914 
1915 static void domain_exit(struct dmar_domain *domain)
1916 {
1917 
1918 	/* Remove associated devices and clear attached or cached domains */
1919 	domain_remove_dev_info(domain);
1920 
1921 	if (domain->pgd) {
1922 		LIST_HEAD(freelist);
1923 
1924 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1925 		put_pages_list(&freelist);
1926 	}
1927 
1928 	kfree(domain);
1929 }
1930 
1931 /*
1932  * Get the PASID directory size for a scalable mode context entry.
1933  * A value of X in the PDTS field of a scalable mode context entry
1934  * indicates a PASID directory with 2^(X + 7) entries.
1935  */
1936 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1937 {
1938 	unsigned long pds, max_pde;
1939 
1940 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1941 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1942 	if (pds < 7)
1943 		return 0;
1944 
1945 	return pds - 7;
1946 }
1947 
1948 /*
1949  * Set the RID_PASID field of a scalable mode context entry. The
1950  * IOMMU hardware will use the PASID value set in this field for
1951  * IOMMU hardware will use the PASID value set in this field when
1952  * translating DMA requests without PASID.
1953 static inline void
1954 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1955 {
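	/* RID_PASID occupies the low 20 bits of the entry's high qword. */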
1956 	context->hi |= pasid & ((1 << 20) - 1);
1957 }
1958 
1959 /*
1960  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1961  * entry.
1962  */
1963 static inline void context_set_sm_dte(struct context_entry *context)
1964 {
1965 	context->lo |= (1 << 2);
1966 }
1967 
1968 /*
1969  * Set the PRE(Page Request Enable) field of a scalable mode context
1970  * entry.
1971  */
1972 static inline void context_set_sm_pre(struct context_entry *context)
1973 {
1974 	context->lo |= (1 << 4);
1975 }
1976 
1977 /* Convert value to context PASID directory size field coding. */
1978 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1979 
1980 static int domain_context_mapping_one(struct dmar_domain *domain,
1981 				      struct intel_iommu *iommu,
1982 				      struct pasid_table *table,
1983 				      u8 bus, u8 devfn)
1984 {
1985 	u16 did = domain->iommu_did[iommu->seq_id];
1986 	int translation = CONTEXT_TT_MULTI_LEVEL;
1987 	struct device_domain_info *info = NULL;
1988 	struct context_entry *context;
1989 	unsigned long flags;
1990 	int ret;
1991 
1992 	WARN_ON(did == 0);
1993 
1994 	if (hw_pass_through && domain_type_is_si(domain))
1995 		translation = CONTEXT_TT_PASS_THROUGH;
1996 
1997 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1998 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1999 
2000 	BUG_ON(!domain->pgd);
2001 
2002 	spin_lock_irqsave(&device_domain_lock, flags);
2003 	spin_lock(&iommu->lock);
2004 
2005 	ret = -ENOMEM;
2006 	context = iommu_context_addr(iommu, bus, devfn, 1);
2007 	if (!context)
2008 		goto out_unlock;
2009 
2010 	ret = 0;
2011 	if (context_present(context))
2012 		goto out_unlock;
2013 
2014 	/*
2015 	 * For kdump cases, old valid entries may be cached due to the
2016 	 * in-flight DMA and copied pgtable, but there is no unmapping
2017 	 * behaviour for them, thus we need an explicit cache flush for
2018 	 * the newly-mapped device. For kdump, at this point, the device
2019 	 * is expected to have finished reset at its driver probe stage, so
2020 	 * no in-flight DMA will exist, and we don't need to worry about it
2021 	 * hereafter.
2022 	 */
2023 	if (context_copied(context)) {
2024 		u16 did_old = context_domain_id(context);
2025 
2026 		if (did_old < cap_ndoms(iommu->cap)) {
2027 			iommu->flush.flush_context(iommu, did_old,
2028 						   (((u16)bus) << 8) | devfn,
2029 						   DMA_CCMD_MASK_NOBIT,
2030 						   DMA_CCMD_DEVICE_INVL);
2031 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2032 						 DMA_TLB_DSI_FLUSH);
2033 		}
2034 	}
2035 
2036 	context_clear_entry(context);
2037 
2038 	if (sm_supported(iommu)) {
2039 		unsigned long pds;
2040 
2041 		WARN_ON(!table);
2042 
2043 		/* Set up the PASID DIR pointer: */
2044 		pds = context_get_sm_pds(table);
2045 		context->lo = (u64)virt_to_phys(table->table) |
2046 				context_pdts(pds);
2047 
2048 		/* Set up the RID_PASID field: */
2049 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2050 
2051 		/*
2052 		 * Set up the Device-TLB Enable bit and the Page Request
2053 		 * Enable bit:
2054 		 */
2055 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2056 		if (info && info->ats_supported)
2057 			context_set_sm_dte(context);
2058 		if (info && info->pri_supported)
2059 			context_set_sm_pre(context);
2060 	} else {
2061 		struct dma_pte *pgd = domain->pgd;
2062 		int agaw;
2063 
2064 		context_set_domain_id(context, did);
2065 
2066 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2067 			/*
2068 			 * Skip top levels of the page table for an IOMMU whose
2069 			 * AGAW is smaller than the default. Unnecessary for PT mode.
2070 			 */
2071 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2072 				ret = -ENOMEM;
2073 				pgd = phys_to_virt(dma_pte_addr(pgd));
2074 				if (!dma_pte_present(pgd))
2075 					goto out_unlock;
2076 			}
2077 
2078 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2079 			if (info && info->ats_supported)
2080 				translation = CONTEXT_TT_DEV_IOTLB;
2081 			else
2082 				translation = CONTEXT_TT_MULTI_LEVEL;
2083 
2084 			context_set_address_root(context, virt_to_phys(pgd));
2085 			context_set_address_width(context, agaw);
2086 		} else {
2087 			/*
2088 			 * In pass-through mode, AW must be programmed to indicate
2089 			 * the largest AGAW value supported by hardware, and ASR is
2090 			 * ignored by hardware.
2091 			 */
2092 			context_set_address_width(context, iommu->msagaw);
2093 		}
2094 
2095 		context_set_translation_type(context, translation);
2096 	}
2097 
2098 	context_set_fault_enable(context);
2099 	context_set_present(context);
2100 	if (!ecap_coherent(iommu->ecap))
2101 		clflush_cache_range(context, sizeof(*context));
2102 
2103 	/*
2104 	 * It's a non-present to present mapping. If hardware doesn't cache
2105 	 * non-present entries, we only need to flush the write-buffer. If it
2106 	 * _does_ cache non-present entries, then it does so in the special
2107 	 * domain #0, which we have to flush:
2108 	 */
2109 	if (cap_caching_mode(iommu->cap)) {
2110 		iommu->flush.flush_context(iommu, 0,
2111 					   (((u16)bus) << 8) | devfn,
2112 					   DMA_CCMD_MASK_NOBIT,
2113 					   DMA_CCMD_DEVICE_INVL);
2114 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2115 	} else {
2116 		iommu_flush_write_buffer(iommu);
2117 	}
2118 	iommu_enable_dev_iotlb(info);
2119 
2120 	ret = 0;
2121 
2122 out_unlock:
2123 	spin_unlock(&iommu->lock);
2124 	spin_unlock_irqrestore(&device_domain_lock, flags);
2125 
2126 	return ret;
2127 }
2128 
2129 struct domain_context_mapping_data {
2130 	struct dmar_domain *domain;
2131 	struct intel_iommu *iommu;
2132 	struct pasid_table *table;
2133 };
2134 
2135 static int domain_context_mapping_cb(struct pci_dev *pdev,
2136 				     u16 alias, void *opaque)
2137 {
2138 	struct domain_context_mapping_data *data = opaque;
2139 
2140 	return domain_context_mapping_one(data->domain, data->iommu,
2141 					  data->table, PCI_BUS_NUM(alias),
2142 					  alias & 0xff);
2143 }
2144 
2145 static int
2146 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2147 {
2148 	struct domain_context_mapping_data data;
2149 	struct pasid_table *table;
2150 	struct intel_iommu *iommu;
2151 	u8 bus, devfn;
2152 
2153 	iommu = device_to_iommu(dev, &bus, &devfn);
2154 	if (!iommu)
2155 		return -ENODEV;
2156 
2157 	table = intel_pasid_get_table(dev);
2158 
2159 	if (!dev_is_pci(dev))
2160 		return domain_context_mapping_one(domain, iommu, table,
2161 						  bus, devfn);
2162 
2163 	data.domain = domain;
2164 	data.iommu = iommu;
2165 	data.table = table;
2166 
2167 	return pci_for_each_dma_alias(to_pci_dev(dev),
2168 				      &domain_context_mapping_cb, &data);
2169 }
2170 
2171 static int domain_context_mapped_cb(struct pci_dev *pdev,
2172 				    u16 alias, void *opaque)
2173 {
2174 	struct intel_iommu *iommu = opaque;
2175 
2176 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2177 }
2178 
2179 static int domain_context_mapped(struct device *dev)
2180 {
2181 	struct intel_iommu *iommu;
2182 	u8 bus, devfn;
2183 
2184 	iommu = device_to_iommu(dev, &bus, &devfn);
2185 	if (!iommu)
2186 		return -ENODEV;
2187 
2188 	if (!dev_is_pci(dev))
2189 		return device_context_mapped(iommu, bus, devfn);
2190 
2191 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2192 				       domain_context_mapped_cb, iommu);
2193 }
2194 
2195 /* Returns the number of VT-d pages, but aligned to the MM page size */
2196 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2197 					    size_t size)
2198 {
2199 	host_addr &= ~PAGE_MASK;
2200 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2201 }
2202 
2203 /* Return largest possible superpage level for a given mapping */
2204 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2205 					  unsigned long iov_pfn,
2206 					  unsigned long phy_pfn,
2207 					  unsigned long pages)
2208 {
2209 	int support, level = 1;
2210 	unsigned long pfnmerge;
2211 
2212 	support = domain->iommu_superpage;
2213 
2214 	/* To use a large page, the virtual *and* physical addresses
2215 	 * must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2216 	 * of them will mean we have to use smaller pages. So just
2217 	 * merge them and check both at once. */
2218 	pfnmerge = iov_pfn | phy_pfn;
2219 
2220 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2221 		pages >>= VTD_STRIDE_SHIFT;
2222 		if (!pages)
2223 			break;
2224 		pfnmerge >>= VTD_STRIDE_SHIFT;
2225 		level++;
2226 		support--;
2227 	}
2228 	return level;
2229 }
2230 
2231 /*
2232  * Ensure that old small page tables are removed to make room for superpage(s).
2233  * We're going to add new large pages, so make sure we don't remove their parent
2234  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2235  */
2236 static void switch_to_super_page(struct dmar_domain *domain,
2237 				 unsigned long start_pfn,
2238 				 unsigned long end_pfn, int level)
2239 {
2240 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2241 	struct dma_pte *pte = NULL;
2242 	int i;
2243 
2244 	while (start_pfn <= end_pfn) {
2245 		if (!pte)
2246 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2247 
2248 		if (dma_pte_present(pte)) {
2249 			dma_pte_free_pagetable(domain, start_pfn,
2250 					       start_pfn + lvl_pages - 1,
2251 					       level + 1);
2252 
2253 			for_each_domain_iommu(i, domain)
2254 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2255 						      start_pfn, lvl_pages,
2256 						      0, 0);
2257 		}
2258 
2259 		pte++;
2260 		start_pfn += lvl_pages;
2261 		if (first_pte_in_page(pte))
2262 			pte = NULL;
2263 	}
2264 }
2265 
2266 static int
2267 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2268 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2269 {
2270 	struct dma_pte *first_pte = NULL, *pte = NULL;
2271 	unsigned int largepage_lvl = 0;
2272 	unsigned long lvl_pages = 0;
2273 	phys_addr_t pteval;
2274 	u64 attr;
2275 
2276 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2277 
2278 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2279 		return -EINVAL;
2280 
2281 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2282 	attr |= DMA_FL_PTE_PRESENT;
2283 	if (domain_use_first_level(domain)) {
2284 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2285 		if (prot & DMA_PTE_WRITE)
2286 			attr |= DMA_FL_PTE_DIRTY;
2287 	}
2288 
2289 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2290 
2291 	while (nr_pages > 0) {
2292 		uint64_t tmp;
2293 
2294 		if (!pte) {
2295 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2296 					phys_pfn, nr_pages);
2297 
2298 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2299 			if (!pte)
2300 				return -ENOMEM;
2301 			first_pte = pte;
2302 
2303 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2304 
2305 			/* It is a large page */
2306 			if (largepage_lvl > 1) {
2307 				unsigned long end_pfn;
2308 				unsigned long pages_to_remove;
2309 
2310 				pteval |= DMA_PTE_LARGE_PAGE;
2311 				pages_to_remove = min_t(unsigned long, nr_pages,
2312 							nr_pte_to_next_page(pte) * lvl_pages);
2313 				end_pfn = iov_pfn + pages_to_remove - 1;
2314 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2315 			} else {
2316 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2317 			}
2318 
2319 		}
2320 		/* We don't need a lock here; nobody else
2321 		 * touches this IOVA range.
2322 		 */
2323 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2324 		if (tmp) {
2325 			static int dumps = 5;
2326 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2327 				iov_pfn, tmp, (unsigned long long)pteval);
2328 			if (dumps) {
2329 				dumps--;
2330 				debug_dma_dump_mappings(NULL);
2331 			}
2332 			WARN_ON(1);
2333 		}
2334 
2335 		nr_pages -= lvl_pages;
2336 		iov_pfn += lvl_pages;
2337 		phys_pfn += lvl_pages;
2338 		pteval += lvl_pages * VTD_PAGE_SIZE;
2339 
2340 		/* If the next PTE would be the first in a new page, then we
2341 		 * need to flush the cache on the entries we've just written.
2342 		 * And then we'll need to recalculate 'pte', so clear it and
2343 		 * let it get set again in the if (!pte) block above.
2344 		 *
2345 		 * If we're done (!nr_pages) we need to flush the cache too.
2346 		 *
2347 		 * Also if we've been setting superpages, we may need to
2348 		 * recalculate 'pte' and switch back to smaller pages for the
2349 		 * end of the mapping, if the trailing size is not enough to
2350 		 * use another superpage (i.e. nr_pages < lvl_pages).
2351 		 */
2352 		pte++;
2353 		if (!nr_pages || first_pte_in_page(pte) ||
2354 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2355 			domain_flush_cache(domain, first_pte,
2356 					   (void *)pte - (void *)first_pte);
2357 			pte = NULL;
2358 		}
2359 	}
2360 
2361 	return 0;
2362 }
2363 
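/*
 * Clear the context entry for @bus/@devfn and flush the context, IOTLB and
 * device-TLB caches (plus the PASID cache in scalable mode) so the hardware
 * stops using it.
 */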
2364 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2365 {
2366 	struct intel_iommu *iommu = info->iommu;
2367 	struct context_entry *context;
2368 	unsigned long flags;
2369 	u16 did_old;
2370 
2371 	if (!iommu)
2372 		return;
2373 
2374 	spin_lock_irqsave(&iommu->lock, flags);
2375 	context = iommu_context_addr(iommu, bus, devfn, 0);
2376 	if (!context) {
2377 		spin_unlock_irqrestore(&iommu->lock, flags);
2378 		return;
2379 	}
2380 
2381 	if (sm_supported(iommu)) {
2382 		if (hw_pass_through && domain_type_is_si(info->domain))
2383 			did_old = FLPT_DEFAULT_DID;
2384 		else
2385 			did_old = info->domain->iommu_did[iommu->seq_id];
2386 	} else {
2387 		did_old = context_domain_id(context);
2388 	}
2389 
2390 	context_clear_entry(context);
2391 	__iommu_flush_cache(iommu, context, sizeof(*context));
2392 	spin_unlock_irqrestore(&iommu->lock, flags);
2393 	iommu->flush.flush_context(iommu,
2394 				   did_old,
2395 				   (((u16)bus) << 8) | devfn,
2396 				   DMA_CCMD_MASK_NOBIT,
2397 				   DMA_CCMD_DEVICE_INVL);
2398 
2399 	if (sm_supported(iommu))
2400 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2401 
2402 	iommu->flush.flush_iotlb(iommu,
2403 				 did_old,
2404 				 0,
2405 				 0,
2406 				 DMA_TLB_DSI_FLUSH);
2407 
2408 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2409 }
2410 
2411 static void domain_remove_dev_info(struct dmar_domain *domain)
2412 {
2413 	struct device_domain_info *info, *tmp;
2414 	unsigned long flags;
2415 
2416 	spin_lock_irqsave(&device_domain_lock, flags);
2417 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2418 		__dmar_remove_one_dev_info(info);
2419 	spin_unlock_irqrestore(&device_domain_lock, flags);
2420 }
2421 
2422 static inline struct device_domain_info *
2423 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2424 {
2425 	struct device_domain_info *info;
2426 
2427 	list_for_each_entry(info, &device_domain_list, global)
2428 		if (info->segment == segment && info->bus == bus &&
2429 		    info->devfn == devfn)
2430 			return info;
2431 
2432 	return NULL;
2433 }
2434 
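/*
 * Set up a first-level (scalable mode) PASID table entry for @dev in
 * @domain, reusing the domain's page table and skipping any top levels
 * that exceed the IOMMU's address width.
 */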
2435 static int domain_setup_first_level(struct intel_iommu *iommu,
2436 				    struct dmar_domain *domain,
2437 				    struct device *dev,
2438 				    u32 pasid)
2439 {
2440 	struct dma_pte *pgd = domain->pgd;
2441 	int agaw, level;
2442 	int flags = 0;
2443 
2444 	/*
2445 	 * Skip top levels of the page table for an IOMMU whose
2446 	 * AGAW is smaller than the default. Unnecessary for PT mode.
2447 	 */
2448 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2449 		pgd = phys_to_virt(dma_pte_addr(pgd));
2450 		if (!dma_pte_present(pgd))
2451 			return -ENOMEM;
2452 	}
2453 
2454 	level = agaw_to_level(agaw);
2455 	if (level != 4 && level != 5)
2456 		return -EINVAL;
2457 
2458 	if (pasid != PASID_RID2PASID)
2459 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2460 	if (level == 5)
2461 		flags |= PASID_FLAG_FL5LP;
2462 
2463 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2464 		flags |= PASID_FLAG_PAGE_SNOOP;
2465 
2466 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2467 					     domain->iommu_did[iommu->seq_id],
2468 					     flags);
2469 }
2470 
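/*
 * A device is a "real DMA subdevice" when its DMA transactions are issued
 * on its behalf by a different PCI device, as reported by
 * pci_real_dma_dev().
 */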
2471 static bool dev_is_real_dma_subdevice(struct device *dev)
2472 {
2473 	return dev && dev_is_pci(dev) &&
2474 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2475 }
2476 
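/*
 * Attach @dev to @domain on @iommu: take a domain-id reference, set up the
 * RID2PASID entry in scalable mode, and install the context mapping.
 * Returns @domain on success or NULL on failure.
 */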
2477 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2478 						    int bus, int devfn,
2479 						    struct device *dev,
2480 						    struct dmar_domain *domain)
2481 {
2482 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2483 	unsigned long flags;
2484 	int ret;
2485 
2486 	spin_lock_irqsave(&device_domain_lock, flags);
2487 	info->domain = domain;
2488 	spin_lock(&iommu->lock);
2489 	ret = domain_attach_iommu(domain, iommu);
2490 	spin_unlock(&iommu->lock);
2491 	if (ret) {
2492 		spin_unlock_irqrestore(&device_domain_lock, flags);
2493 		return NULL;
2494 	}
2495 	list_add(&info->link, &domain->devices);
2496 	spin_unlock_irqrestore(&device_domain_lock, flags);
2497 
2498 	/* PASID table is mandatory for a PCI device in scalable mode. */
2499 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2500 		ret = intel_pasid_alloc_table(dev);
2501 		if (ret) {
2502 			dev_err(dev, "PASID table allocation failed\n");
2503 			dmar_remove_one_dev_info(dev);
2504 			return NULL;
2505 		}
2506 
2507 		/* Set up the PASID entry for requests without PASID: */
2508 		spin_lock_irqsave(&iommu->lock, flags);
2509 		if (hw_pass_through && domain_type_is_si(domain))
2510 			ret = intel_pasid_setup_pass_through(iommu, domain,
2511 					dev, PASID_RID2PASID);
2512 		else if (domain_use_first_level(domain))
2513 			ret = domain_setup_first_level(iommu, domain, dev,
2514 					PASID_RID2PASID);
2515 		else
2516 			ret = intel_pasid_setup_second_level(iommu, domain,
2517 					dev, PASID_RID2PASID);
2518 		spin_unlock_irqrestore(&iommu->lock, flags);
2519 		if (ret) {
2520 			dev_err(dev, "Setup RID2PASID failed\n");
2521 			dmar_remove_one_dev_info(dev);
2522 			return NULL;
2523 		}
2524 	}
2525 
2526 	if (dev && domain_context_mapping(domain, dev)) {
2527 		dev_err(dev, "Domain context map failed\n");
2528 		dmar_remove_one_dev_info(dev);
2529 		return NULL;
2530 	}
2531 
2532 	return domain;
2533 }
2534 
2535 static int iommu_domain_identity_map(struct dmar_domain *domain,
2536 				     unsigned long first_vpfn,
2537 				     unsigned long last_vpfn)
2538 {
2539 	/*
2540 	 * The RMRR range might overlap with the physical memory range;
2541 	 * clear it first.
2542 	 */
2543 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2544 
2545 	return __domain_mapping(domain, first_vpfn,
2546 				first_vpfn, last_vpfn - first_vpfn + 1,
2547 				DMA_PTE_READ|DMA_PTE_WRITE);
2548 }
2549 
2550 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2551 
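/*
 * Build the static identity (si) domain. Unless hardware pass-through is
 * used, identity-map all usable physical memory and the RMRR regions so
 * that devices attached to si_domain can DMA to them directly.
 */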
2552 static int __init si_domain_init(int hw)
2553 {
2554 	struct dmar_rmrr_unit *rmrr;
2555 	struct device *dev;
2556 	int i, nid, ret;
2557 
2558 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2559 	if (!si_domain)
2560 		return -EFAULT;
2561 
2562 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2563 		domain_exit(si_domain);
2564 		return -EFAULT;
2565 	}
2566 
2567 	if (hw)
2568 		return 0;
2569 
2570 	for_each_online_node(nid) {
2571 		unsigned long start_pfn, end_pfn;
2572 		int i;
2573 
2574 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2575 			ret = iommu_domain_identity_map(si_domain,
2576 					mm_to_dma_pfn(start_pfn),
2577 					mm_to_dma_pfn(end_pfn));
2578 			if (ret)
2579 				return ret;
2580 		}
2581 	}
2582 
2583 	/*
2584 	 * Identity map the RMRRs so that devices with RMRRs can also use
2585 	 * the si_domain.
2586 	 */
2587 	for_each_rmrr_units(rmrr) {
2588 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2589 					  i, dev) {
2590 			unsigned long long start = rmrr->base_address;
2591 			unsigned long long end = rmrr->end_address;
2592 
2593 			if (WARN_ON(end < start ||
2594 				    end >> agaw_to_width(si_domain->agaw)))
2595 				continue;
2596 
2597 			ret = iommu_domain_identity_map(si_domain,
2598 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2599 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2600 			if (ret)
2601 				return ret;
2602 		}
2603 	}
2604 
2605 	return 0;
2606 }
2607 
2608 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2609 {
2610 	struct dmar_domain *ndomain;
2611 	struct intel_iommu *iommu;
2612 	u8 bus, devfn;
2613 
2614 	iommu = device_to_iommu(dev, &bus, &devfn);
2615 	if (!iommu)
2616 		return -ENODEV;
2617 
2618 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2619 	if (ndomain != domain)
2620 		return -EBUSY;
2621 
2622 	return 0;
2623 }
2624 
2625 static bool device_has_rmrr(struct device *dev)
2626 {
2627 	struct dmar_rmrr_unit *rmrr;
2628 	struct device *tmp;
2629 	int i;
2630 
2631 	rcu_read_lock();
2632 	for_each_rmrr_units(rmrr) {
2633 		/*
2634 		 * Return TRUE if this RMRR contains the device that
2635 		 * is passed in.
2636 		 */
2637 		for_each_active_dev_scope(rmrr->devices,
2638 					  rmrr->devices_cnt, i, tmp)
2639 			if (tmp == dev ||
2640 			    is_downstream_to_pci_bridge(dev, tmp)) {
2641 				rcu_read_unlock();
2642 				return true;
2643 			}
2644 	}
2645 	rcu_read_unlock();
2646 	return false;
2647 }
2648 
2649 /**
2650  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2651  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2652  * @dev: device handle
2653  *
2654  * We assume that PCI USB devices with RMRRs have them largely
2655  * for historical reasons and that the RMRR space is not actively used post
2656  * boot.  This exclusion may change if vendors begin to abuse it.
2657  *
2658  * The same exception is made for graphics devices, with the requirement that
2659  * any use of the RMRR regions will be torn down before assigning the device
2660  * to a guest.
2661  *
2662  * Return: true if the RMRR is relaxable, false otherwise
2663  */
2664 static bool device_rmrr_is_relaxable(struct device *dev)
2665 {
2666 	struct pci_dev *pdev;
2667 
2668 	if (!dev_is_pci(dev))
2669 		return false;
2670 
2671 	pdev = to_pci_dev(dev);
2672 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2673 		return true;
2674 	else
2675 		return false;
2676 }
2677 
2678 /*
2679  * There are a couple of cases where we need to restrict the functionality of
2680  * devices associated with RMRRs.  The first is when evaluating a device for
2681  * identity mapping because problems exist when devices are moved in and out
2682  * of domains and their respective RMRR information is lost.  This means that
2683  * a device with associated RMRRs will never be in a "passthrough" domain.
2684  * The second is use of the device through the IOMMU API.  This interface
2685  * expects to have full control of the IOVA space for the device.  We cannot
2686  * satisfy both the requirement that RMRR access is maintained and have an
2687  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2688  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2689  * We therefore prevent devices associated with an RMRR from participating in
2690  * the IOMMU API, which eliminates them from device assignment.
2691  *
2692  * In both cases, devices which have relaxable RMRRs are not concerned by this
2693  * restriction. See device_rmrr_is_relaxable comment.
2694  */
2695 static bool device_is_rmrr_locked(struct device *dev)
2696 {
2697 	if (!device_has_rmrr(dev))
2698 		return false;
2699 
2700 	if (device_rmrr_is_relaxable(dev))
2701 		return false;
2702 
2703 	return true;
2704 }
2705 
2706 /*
2707  * Return the required default domain type for a specific device.
2708  *
2709  * @dev: the device in query
2711  *
2712  * Returns:
2713  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2714  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2715  *  - 0: both identity and dynamic domains work for this device
2716  */
2717 static int device_def_domain_type(struct device *dev)
2718 {
2719 	if (dev_is_pci(dev)) {
2720 		struct pci_dev *pdev = to_pci_dev(dev);
2721 
2722 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2723 			return IOMMU_DOMAIN_IDENTITY;
2724 
2725 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2726 			return IOMMU_DOMAIN_IDENTITY;
2727 	}
2728 
2729 	return 0;
2730 }
2731 
2732 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2733 {
2734 	/*
2735 	 * Start from a sane IOMMU hardware state.
2736 	 * If queued invalidation was already initialized by us
2737 	 * (for example, while enabling interrupt remapping) then
2738 	 * things are already rolling from a sane state.
2739 	 */
2740 	if (!iommu->qi) {
2741 		/*
2742 		 * Clear any previous faults.
2743 		 */
2744 		dmar_fault(-1, iommu);
2745 		/*
2746 		 * Disable queued invalidation if supported and already enabled
2747 		 * before OS handover.
2748 		 */
2749 		dmar_disable_qi(iommu);
2750 	}
2751 
2752 	if (dmar_enable_qi(iommu)) {
2753 		/*
2754 		 * Queued Invalidate not enabled, use Register Based Invalidate
2755 		 */
2756 		iommu->flush.flush_context = __iommu_flush_context;
2757 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2758 		pr_info("%s: Using Register based invalidation\n",
2759 			iommu->name);
2760 	} else {
2761 		iommu->flush.flush_context = qi_flush_context;
2762 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2763 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2764 	}
2765 }
2766 
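/*
 * Copy one bus's context table from the root entry left by the previous
 * kernel into freshly allocated tables, reserving any domain-ids found and
 * marking each entry as copied so later updates can detect it.
 */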
2767 static int copy_context_table(struct intel_iommu *iommu,
2768 			      struct root_entry *old_re,
2769 			      struct context_entry **tbl,
2770 			      int bus, bool ext)
2771 {
2772 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2773 	struct context_entry *new_ce = NULL, ce;
2774 	struct context_entry *old_ce = NULL;
2775 	struct root_entry re;
2776 	phys_addr_t old_ce_phys;
2777 
2778 	tbl_idx = ext ? bus * 2 : bus;
2779 	memcpy(&re, old_re, sizeof(re));
2780 
2781 	for (devfn = 0; devfn < 256; devfn++) {
2782 		/* First calculate the correct index */
2783 		idx = (ext ? devfn * 2 : devfn) % 256;
2784 
2785 		if (idx == 0) {
2786 			/* First save what we may have and clean up */
2787 			if (new_ce) {
2788 				tbl[tbl_idx] = new_ce;
2789 				__iommu_flush_cache(iommu, new_ce,
2790 						    VTD_PAGE_SIZE);
2791 				pos = 1;
2792 			}
2793 
2794 			if (old_ce)
2795 				memunmap(old_ce);
2796 
2797 			ret = 0;
2798 			if (devfn < 0x80)
2799 				old_ce_phys = root_entry_lctp(&re);
2800 			else
2801 				old_ce_phys = root_entry_uctp(&re);
2802 
2803 			if (!old_ce_phys) {
2804 				if (ext && devfn == 0) {
2805 					/* No LCTP, try UCTP */
2806 					devfn = 0x7f;
2807 					continue;
2808 				} else {
2809 					goto out;
2810 				}
2811 			}
2812 
2813 			ret = -ENOMEM;
2814 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2815 					MEMREMAP_WB);
2816 			if (!old_ce)
2817 				goto out;
2818 
2819 			new_ce = alloc_pgtable_page(iommu->node);
2820 			if (!new_ce)
2821 				goto out_unmap;
2822 
2823 			ret = 0;
2824 		}
2825 
2826 		/* Now copy the context entry */
2827 		memcpy(&ce, old_ce + idx, sizeof(ce));
2828 
2829 		if (!__context_present(&ce))
2830 			continue;
2831 
2832 		did = context_domain_id(&ce);
2833 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2834 			set_bit(did, iommu->domain_ids);
2835 
2836 		/*
2837 		 * We need a marker for copied context entries. This
2838 		 * marker needs to work for the old format as well as
2839 		 * for extended context entries.
2840 		 *
2841 		 * Bit 67 of the context entry is used. In the old
2842 		 * format this bit is available to software, in the
2843 		 * extended format it is the PGE bit, but PGE is ignored
2844 		 * by HW if PASIDs are disabled (and thus still
2845 		 * available).
2846 		 *
2847 		 * So disable PASIDs first and then mark the entry
2848 		 * copied. This means that we don't copy PASID
2849 		 * translations from the old kernel, but this is fine as
2850 		 * faults there are not fatal.
2851 		 */
2852 		context_clear_pasid_enable(&ce);
2853 		context_set_copied(&ce);
2854 
2855 		new_ce[idx] = ce;
2856 	}
2857 
2858 	tbl[tbl_idx + pos] = new_ce;
2859 
2860 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2861 
2862 out_unmap:
2863 	memunmap(old_ce);
2864 
2865 out:
2866 	return ret;
2867 }
2868 
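/*
 * Copy the root/context tables programmed by the previous kernel (e.g. for
 * kdump) and hook them into this kernel's root entry table.
 */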
2869 static int copy_translation_tables(struct intel_iommu *iommu)
2870 {
2871 	struct context_entry **ctxt_tbls;
2872 	struct root_entry *old_rt;
2873 	phys_addr_t old_rt_phys;
2874 	int ctxt_table_entries;
2875 	unsigned long flags;
2876 	u64 rtaddr_reg;
2877 	int bus, ret;
2878 	bool new_ext, ext;
2879 
2880 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2881 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2882 	new_ext    = !!ecap_ecs(iommu->ecap);
2883 
2884 	/*
2885 	 * The RTT bit can only be changed when translation is disabled,
2886 	 * but disabling translation would open a window for data
2887 	 * corruption. So bail out and don't copy anything if we would
2888 	 * have to change the bit.
2889 	 */
2890 	if (new_ext != ext)
2891 		return -EINVAL;
2892 
2893 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2894 	if (!old_rt_phys)
2895 		return -EINVAL;
2896 
2897 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2898 	if (!old_rt)
2899 		return -ENOMEM;
2900 
2901 	/* This is too big for the stack - allocate it from slab */
2902 	ctxt_table_entries = ext ? 512 : 256;
2903 	ret = -ENOMEM;
2904 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2905 	if (!ctxt_tbls)
2906 		goto out_unmap;
2907 
2908 	for (bus = 0; bus < 256; bus++) {
2909 		ret = copy_context_table(iommu, &old_rt[bus],
2910 					 ctxt_tbls, bus, ext);
2911 		if (ret) {
2912 			pr_err("%s: Failed to copy context table for bus %d\n",
2913 				iommu->name, bus);
2914 			continue;
2915 		}
2916 	}
2917 
2918 	spin_lock_irqsave(&iommu->lock, flags);
2919 
2920 	/* Context tables are copied, now write them to the root_entry table */
2921 	for (bus = 0; bus < 256; bus++) {
2922 		int idx = ext ? bus * 2 : bus;
2923 		u64 val;
2924 
2925 		if (ctxt_tbls[idx]) {
2926 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2927 			iommu->root_entry[bus].lo = val;
2928 		}
2929 
2930 		if (!ext || !ctxt_tbls[idx + 1])
2931 			continue;
2932 
2933 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2934 		iommu->root_entry[bus].hi = val;
2935 	}
2936 
2937 	spin_unlock_irqrestore(&iommu->lock, flags);
2938 
2939 	kfree(ctxt_tbls);
2940 
2941 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2942 
2943 	ret = 0;
2944 
2945 out_unmap:
2946 	memunmap(old_rt);
2947 
2948 	return ret;
2949 }
2950 
2951 #ifdef CONFIG_INTEL_IOMMU_SVM
2952 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2953 {
2954 	struct intel_iommu *iommu = data;
2955 	ioasid_t ioasid;
2956 
2957 	if (!iommu)
2958 		return INVALID_IOASID;
2959 	/*
2960 	 * The VT-d virtual command interface always uses the full 20-bit
2961 	 * PASID range. The host can partition the guest PASID range based
2962 	 * on its policies, but that is out of the guest's control.
2963 	 */
2964 	if (min < PASID_MIN || max > intel_pasid_max_id)
2965 		return INVALID_IOASID;
2966 
2967 	if (vcmd_alloc_pasid(iommu, &ioasid))
2968 		return INVALID_IOASID;
2969 
2970 	return ioasid;
2971 }
2972 
2973 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2974 {
2975 	struct intel_iommu *iommu = data;
2976 
2977 	if (!iommu)
2978 		return;
2979 	/*
2980 	 * The IOASID owner is sanity checked at the upper layer, e.g. VFIO.
2981 	 * We can only free the PASID when all the devices are unbound.
2982 	 */
2983 	if (ioasid_find(NULL, ioasid, NULL)) {
2984 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2985 		return;
2986 	}
2987 	vcmd_free_pasid(iommu, ioasid);
2988 }
2989 
2990 static void register_pasid_allocator(struct intel_iommu *iommu)
2991 {
2992 	/*
2993 	 * If we are running in the host, there is no need for a custom
2994 	 * allocator, since PASIDs are allocated system-wide by the host.
2995 	 */
2996 	if (!cap_caching_mode(iommu->cap))
2997 		return;
2998 
2999 	if (!sm_supported(iommu)) {
3000 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3001 		return;
3002 	}
3003 
3004 	/*
3005 	 * Register a custom PASID allocator if we are running in a guest;
3006 	 * guest PASIDs must be obtained via the virtual command interface.
3007 	 * There can be multiple vIOMMUs in each guest, but only one allocator
3008 	 * is active. All vIOMMU allocators will eventually call the same
3009 	 * host allocator.
3010 	 */
3011 	if (!vccap_pasid(iommu->vccap))
3012 		return;
3013 
3014 	pr_info("Register custom PASID allocator\n");
3015 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3016 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3017 	iommu->pasid_allocator.pdata = (void *)iommu;
3018 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3019 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3020 		/*
3021 		 * Disable scalable mode on this IOMMU if there is no
3022 		 * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3023 		 * is not supported.
3024 		 */
3025 		intel_iommu_sm = 0;
3026 	}
3027 }
3028 #endif
3029 
3030 static int __init init_dmars(void)
3031 {
3032 	struct dmar_drhd_unit *drhd;
3033 	struct intel_iommu *iommu;
3034 	int ret;
3035 
3036 	/*
3037 	 * for each drhd
3038 	 *    allocate root
3039 	 *    initialize and program root entry to not present
3040 	 * endfor
3041 	 */
3042 	for_each_drhd_unit(drhd) {
3043 		/*
3044 		 * Lock not needed as this is only incremented in the
3045 		 * single-threaded kernel __init code path; all other
3046 		 * accesses are read-only.
3047 		 */
3048 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3049 			g_num_of_iommus++;
3050 			continue;
3051 		}
3052 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3053 	}
3054 
3055 	/* Preallocate enough resources for IOMMU hot-addition */
3056 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3057 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3058 
3059 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3060 			GFP_KERNEL);
3061 	if (!g_iommus) {
3062 		ret = -ENOMEM;
3063 		goto error;
3064 	}
3065 
3066 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3067 	if (ret)
3068 		goto free_iommu;
3069 
3070 	for_each_iommu(iommu, drhd) {
3071 		if (drhd->ignored) {
3072 			iommu_disable_translation(iommu);
3073 			continue;
3074 		}
3075 
3076 		/*
3077 		 * Find the max PASID size of all IOMMUs in the system.
3078 		 * We need to ensure the system PASID table is no bigger
3079 		 * than the smallest supported size.
3080 		 */
3081 		if (pasid_supported(iommu)) {
3082 			u32 temp = 2 << ecap_pss(iommu->ecap);
3083 
3084 			intel_pasid_max_id = min_t(u32, temp,
3085 						   intel_pasid_max_id);
3086 		}
3087 
3088 		g_iommus[iommu->seq_id] = iommu;
3089 
3090 		intel_iommu_init_qi(iommu);
3091 
3092 		ret = iommu_init_domains(iommu);
3093 		if (ret)
3094 			goto free_iommu;
3095 
3096 		init_translation_status(iommu);
3097 
3098 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3099 			iommu_disable_translation(iommu);
3100 			clear_translation_pre_enabled(iommu);
3101 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3102 				iommu->name);
3103 		}
3104 
3105 		/*
3106 		 * TBD:
3107 		 * we could share the same root & context tables
3108 		 * among all IOMMUs. Needs to be split out later.
3109 		 */
3110 		ret = iommu_alloc_root_entry(iommu);
3111 		if (ret)
3112 			goto free_iommu;
3113 
3114 		if (translation_pre_enabled(iommu)) {
3115 			pr_info("Translation already enabled - trying to copy translation structures\n");
3116 
3117 			ret = copy_translation_tables(iommu);
3118 			if (ret) {
3119 				/*
3120 				 * We found the IOMMU with translation
3121 				 * enabled - but failed to copy over the
3122 				 * old root-entry table. Try to proceed
3123 				 * by disabling translation now and
3124 				 * allocating a clean root-entry table.
3125 				 * This might cause DMAR faults, but
3126 				 * probably the dump will still succeed.
3127 				 */
3128 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3129 				       iommu->name);
3130 				iommu_disable_translation(iommu);
3131 				clear_translation_pre_enabled(iommu);
3132 			} else {
3133 				pr_info("Copied translation tables from previous kernel for %s\n",
3134 					iommu->name);
3135 			}
3136 		}
3137 
3138 		if (!ecap_pass_through(iommu->ecap))
3139 			hw_pass_through = 0;
3140 		intel_svm_check(iommu);
3141 	}
3142 
3143 	/*
3144 	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
3145 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3146 	 * flush_context function will loop forever and the boot hangs.
3147 	 */
3148 	for_each_active_iommu(iommu, drhd) {
3149 		iommu_flush_write_buffer(iommu);
3150 #ifdef CONFIG_INTEL_IOMMU_SVM
3151 		register_pasid_allocator(iommu);
3152 #endif
3153 		iommu_set_root_entry(iommu);
3154 	}
3155 
3156 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3157 	dmar_map_gfx = 0;
3158 #endif
3159 
3160 	if (!dmar_map_gfx)
3161 		iommu_identity_mapping |= IDENTMAP_GFX;
3162 
3163 	check_tylersburg_isoch();
3164 
3165 	ret = si_domain_init(hw_pass_through);
3166 	if (ret)
3167 		goto free_iommu;
3168 
3169 	/*
3170 	 * for each drhd
3171 	 *   enable fault log
3172 	 *   global invalidate context cache
3173 	 *   global invalidate iotlb
3174 	 *   enable translation
3175 	 */
3176 	for_each_iommu(iommu, drhd) {
3177 		if (drhd->ignored) {
3178 			/*
3179 			 * we always have to disable PMRs or DMA may fail on
3180 			 * this device
3181 			 */
3182 			if (force_on)
3183 				iommu_disable_protect_mem_regions(iommu);
3184 			continue;
3185 		}
3186 
3187 		iommu_flush_write_buffer(iommu);
3188 
3189 #ifdef CONFIG_INTEL_IOMMU_SVM
3190 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3191 			/*
3192 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3193 			 * could cause a lock race condition.
3194 			 */
3195 			up_write(&dmar_global_lock);
3196 			ret = intel_svm_enable_prq(iommu);
3197 			down_write(&dmar_global_lock);
3198 			if (ret)
3199 				goto free_iommu;
3200 		}
3201 #endif
3202 		ret = dmar_set_interrupt(iommu);
3203 		if (ret)
3204 			goto free_iommu;
3205 	}
3206 
3207 	return 0;
3208 
3209 free_iommu:
3210 	for_each_active_iommu(iommu, drhd) {
3211 		disable_dmar_iommu(iommu);
3212 		free_dmar_iommu(iommu);
3213 	}
3214 
3215 	kfree(g_iommus);
3216 
3217 error:
3218 	return ret;
3219 }
3220 
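/*
 * Mark DMAR units that have no devices in their scope as ignored, and
 * likewise units that cover only graphics devices when gfx mapping is
 * disabled.
 */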
3221 static void __init init_no_remapping_devices(void)
3222 {
3223 	struct dmar_drhd_unit *drhd;
3224 	struct device *dev;
3225 	int i;
3226 
3227 	for_each_drhd_unit(drhd) {
3228 		if (!drhd->include_all) {
3229 			for_each_active_dev_scope(drhd->devices,
3230 						  drhd->devices_cnt, i, dev)
3231 				break;
3232 			/* ignore DMAR unit if no devices exist */
3233 			if (i == drhd->devices_cnt)
3234 				drhd->ignored = 1;
3235 		}
3236 	}
3237 
3238 	for_each_active_drhd_unit(drhd) {
3239 		if (drhd->include_all)
3240 			continue;
3241 
3242 		for_each_active_dev_scope(drhd->devices,
3243 					  drhd->devices_cnt, i, dev)
3244 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3245 				break;
3246 		if (i < drhd->devices_cnt)
3247 			continue;
3248 
3249 		/* This IOMMU has *only* gfx devices. Either bypass it or
3250 		 * set the gfx_dedicated flag, as appropriate. */
3251 		drhd->gfx_dedicated = 1;
3252 		if (!dmar_map_gfx)
3253 			drhd->ignored = 1;
3254 	}
3255 }
3256 
3257 #ifdef CONFIG_SUSPEND
3258 static int init_iommu_hw(void)
3259 {
3260 	struct dmar_drhd_unit *drhd;
3261 	struct intel_iommu *iommu = NULL;
3262 
3263 	for_each_active_iommu(iommu, drhd)
3264 		if (iommu->qi)
3265 			dmar_reenable_qi(iommu);
3266 
3267 	for_each_iommu(iommu, drhd) {
3268 		if (drhd->ignored) {
3269 			/*
3270 			 * we always have to disable PMRs or DMA may fail on
3271 			 * this device
3272 			 */
3273 			if (force_on)
3274 				iommu_disable_protect_mem_regions(iommu);
3275 			continue;
3276 		}
3277 
3278 		iommu_flush_write_buffer(iommu);
3279 		iommu_set_root_entry(iommu);
3280 		iommu_enable_translation(iommu);
3281 		iommu_disable_protect_mem_regions(iommu);
3282 	}
3283 
3284 	return 0;
3285 }
3286 
3287 static void iommu_flush_all(void)
3288 {
3289 	struct dmar_drhd_unit *drhd;
3290 	struct intel_iommu *iommu;
3291 
3292 	for_each_active_iommu(iommu, drhd) {
3293 		iommu->flush.flush_context(iommu, 0, 0, 0,
3294 					   DMA_CCMD_GLOBAL_INVL);
3295 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3296 					 DMA_TLB_GLOBAL_FLUSH);
3297 	}
3298 }
3299 
3300 static int iommu_suspend(void)
3301 {
3302 	struct dmar_drhd_unit *drhd;
3303 	struct intel_iommu *iommu = NULL;
3304 	unsigned long flag;
3305 
3306 	for_each_active_iommu(iommu, drhd) {
3307 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3308 					     GFP_KERNEL);
3309 		if (!iommu->iommu_state)
3310 			goto nomem;
3311 	}
3312 
3313 	iommu_flush_all();
3314 
3315 	for_each_active_iommu(iommu, drhd) {
3316 		iommu_disable_translation(iommu);
3317 
3318 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3319 
3320 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3321 			readl(iommu->reg + DMAR_FECTL_REG);
3322 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3323 			readl(iommu->reg + DMAR_FEDATA_REG);
3324 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3325 			readl(iommu->reg + DMAR_FEADDR_REG);
3326 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3327 			readl(iommu->reg + DMAR_FEUADDR_REG);
3328 
3329 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3330 	}
3331 	return 0;
3332 
3333 nomem:
3334 	for_each_active_iommu(iommu, drhd)
3335 		kfree(iommu->iommu_state);
3336 
3337 	return -ENOMEM;
3338 }
3339 
3340 static void iommu_resume(void)
3341 {
3342 	struct dmar_drhd_unit *drhd;
3343 	struct intel_iommu *iommu = NULL;
3344 	unsigned long flag;
3345 
3346 	if (init_iommu_hw()) {
3347 		if (force_on)
3348 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3349 		else
3350 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3351 		return;
3352 	}
3353 
3354 	for_each_active_iommu(iommu, drhd) {
3355 
3356 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3357 
3358 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3359 			iommu->reg + DMAR_FECTL_REG);
3360 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3361 			iommu->reg + DMAR_FEDATA_REG);
3362 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3363 			iommu->reg + DMAR_FEADDR_REG);
3364 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3365 			iommu->reg + DMAR_FEUADDR_REG);
3366 
3367 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3368 	}
3369 
3370 	for_each_active_iommu(iommu, drhd)
3371 		kfree(iommu->iommu_state);
3372 }
3373 
3374 static struct syscore_ops iommu_syscore_ops = {
3375 	.resume		= iommu_resume,
3376 	.suspend	= iommu_suspend,
3377 };
3378 
3379 static void __init init_iommu_pm_ops(void)
3380 {
3381 	register_syscore_ops(&iommu_syscore_ops);
3382 }
3383 
3384 #else
3385 static inline void init_iommu_pm_ops(void) {}
3386 #endif	/* CONFIG_PM */
3387 
3388 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3389 {
3390 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3391 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3392 	    rmrr->end_address <= rmrr->base_address ||
3393 	    arch_rmrr_sanity_check(rmrr))
3394 		return -EINVAL;
3395 
3396 	return 0;
3397 }
3398 
3399 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3400 {
3401 	struct acpi_dmar_reserved_memory *rmrr;
3402 	struct dmar_rmrr_unit *rmrru;
3403 
3404 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3405 	if (rmrr_sanity_check(rmrr)) {
3406 		pr_warn(FW_BUG
3407 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3408 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3409 			   rmrr->base_address, rmrr->end_address,
3410 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3411 			   dmi_get_system_info(DMI_BIOS_VERSION),
3412 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3413 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3414 	}
3415 
3416 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3417 	if (!rmrru)
3418 		goto out;
3419 
3420 	rmrru->hdr = header;
3421 
3422 	rmrru->base_address = rmrr->base_address;
3423 	rmrru->end_address = rmrr->end_address;
3424 
3425 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3426 				((void *)rmrr) + rmrr->header.length,
3427 				&rmrru->devices_cnt);
3428 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3429 		goto free_rmrru;
3430 
3431 	list_add(&rmrru->list, &dmar_rmrr_units);
3432 
3433 	return 0;
3434 free_rmrru:
3435 	kfree(rmrru);
3436 out:
3437 	return -ENOMEM;
3438 }
3439 
3440 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3441 {
3442 	struct dmar_atsr_unit *atsru;
3443 	struct acpi_dmar_atsr *tmp;
3444 
3445 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3446 				dmar_rcu_check()) {
3447 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3448 		if (atsr->segment != tmp->segment)
3449 			continue;
3450 		if (atsr->header.length != tmp->header.length)
3451 			continue;
3452 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3453 			return atsru;
3454 	}
3455 
3456 	return NULL;
3457 }
3458 
3459 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3460 {
3461 	struct acpi_dmar_atsr *atsr;
3462 	struct dmar_atsr_unit *atsru;
3463 
3464 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3465 		return 0;
3466 
3467 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3468 	atsru = dmar_find_atsr(atsr);
3469 	if (atsru)
3470 		return 0;
3471 
3472 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3473 	if (!atsru)
3474 		return -ENOMEM;
3475 
3476 	/*
3477 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3478 	 * copy the memory content because the memory buffer will be freed
3479 	 * on return.
3480 	 */
3481 	atsru->hdr = (void *)(atsru + 1);
3482 	memcpy(atsru->hdr, hdr, hdr->length);
3483 	atsru->include_all = atsr->flags & 0x1;
3484 	if (!atsru->include_all) {
3485 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3486 				(void *)atsr + atsr->header.length,
3487 				&atsru->devices_cnt);
3488 		if (atsru->devices_cnt && atsru->devices == NULL) {
3489 			kfree(atsru);
3490 			return -ENOMEM;
3491 		}
3492 	}
3493 
3494 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3495 
3496 	return 0;
3497 }
3498 
3499 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3500 {
3501 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3502 	kfree(atsru);
3503 }
3504 
3505 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3506 {
3507 	struct acpi_dmar_atsr *atsr;
3508 	struct dmar_atsr_unit *atsru;
3509 
3510 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3511 	atsru = dmar_find_atsr(atsr);
3512 	if (atsru) {
3513 		list_del_rcu(&atsru->list);
3514 		synchronize_rcu();
3515 		intel_iommu_free_atsr(atsru);
3516 	}
3517 
3518 	return 0;
3519 }
3520 
3521 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3522 {
3523 	int i;
3524 	struct device *dev;
3525 	struct acpi_dmar_atsr *atsr;
3526 	struct dmar_atsr_unit *atsru;
3527 
3528 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3529 	atsru = dmar_find_atsr(atsr);
3530 	if (!atsru)
3531 		return 0;
3532 
3533 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3534 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3535 					  i, dev)
3536 			return -EBUSY;
3537 	}
3538 
3539 	return 0;
3540 }
3541 
3542 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3543 {
3544 	struct dmar_satc_unit *satcu;
3545 	struct acpi_dmar_satc *tmp;
3546 
3547 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3548 				dmar_rcu_check()) {
3549 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3550 		if (satc->segment != tmp->segment)
3551 			continue;
3552 		if (satc->header.length != tmp->header.length)
3553 			continue;
3554 		if (memcmp(satc, tmp, satc->header.length) == 0)
3555 			return satcu;
3556 	}
3557 
3558 	return NULL;
3559 }
3560 
3561 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3562 {
3563 	struct acpi_dmar_satc *satc;
3564 	struct dmar_satc_unit *satcu;
3565 
3566 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3567 		return 0;
3568 
3569 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3570 	satcu = dmar_find_satc(satc);
3571 	if (satcu)
3572 		return 0;
3573 
3574 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3575 	if (!satcu)
3576 		return -ENOMEM;
3577 
3578 	satcu->hdr = (void *)(satcu + 1);
3579 	memcpy(satcu->hdr, hdr, hdr->length);
3580 	satcu->atc_required = satc->flags & 0x1;
3581 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3582 					      (void *)satc + satc->header.length,
3583 					      &satcu->devices_cnt);
3584 	if (satcu->devices_cnt && !satcu->devices) {
3585 		kfree(satcu);
3586 		return -ENOMEM;
3587 	}
3588 	list_add_rcu(&satcu->list, &dmar_satc_units);
3589 
3590 	return 0;
3591 }
3592 
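/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate its
 * domain-id bitmap and root entry table, and enable translation unless the
 * unit is ignored.
 */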
3593 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3594 {
3595 	int sp, ret;
3596 	struct intel_iommu *iommu = dmaru->iommu;
3597 
3598 	if (g_iommus[iommu->seq_id])
3599 		return 0;
3600 
3601 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3602 	if (ret)
3603 		goto out;
3604 
3605 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3606 		pr_warn("%s: Doesn't support hardware pass through.\n",
3607 			iommu->name);
3608 		return -ENXIO;
3609 	}
3610 	if (!ecap_sc_support(iommu->ecap) &&
3611 	    domain_update_iommu_snooping(iommu)) {
3612 		pr_warn("%s: Doesn't support snooping.\n",
3613 			iommu->name);
3614 		return -ENXIO;
3615 	}
3616 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3617 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3618 		pr_warn("%s: Doesn't support large page.\n",
3619 			iommu->name);
3620 		return -ENXIO;
3621 	}
3622 
3623 	/*
3624 	 * Disable translation if already enabled prior to OS handover.
3625 	 */
3626 	if (iommu->gcmd & DMA_GCMD_TE)
3627 		iommu_disable_translation(iommu);
3628 
3629 	g_iommus[iommu->seq_id] = iommu;
3630 	ret = iommu_init_domains(iommu);
3631 	if (ret == 0)
3632 		ret = iommu_alloc_root_entry(iommu);
3633 	if (ret)
3634 		goto out;
3635 
3636 	intel_svm_check(iommu);
3637 
3638 	if (dmaru->ignored) {
3639 		/*
3640 		 * we always have to disable PMRs or DMA may fail on this device
3641 		 */
3642 		if (force_on)
3643 			iommu_disable_protect_mem_regions(iommu);
3644 		return 0;
3645 	}
3646 
3647 	intel_iommu_init_qi(iommu);
3648 	iommu_flush_write_buffer(iommu);
3649 
3650 #ifdef CONFIG_INTEL_IOMMU_SVM
3651 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3652 		ret = intel_svm_enable_prq(iommu);
3653 		if (ret)
3654 			goto disable_iommu;
3655 	}
3656 #endif
3657 	ret = dmar_set_interrupt(iommu);
3658 	if (ret)
3659 		goto disable_iommu;
3660 
3661 	iommu_set_root_entry(iommu);
3662 	iommu_enable_translation(iommu);
3663 
3664 	iommu_disable_protect_mem_regions(iommu);
3665 	return 0;
3666 
3667 disable_iommu:
3668 	disable_dmar_iommu(iommu);
3669 out:
3670 	free_dmar_iommu(iommu);
3671 	return ret;
3672 }
3673 
3674 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3675 {
3676 	int ret = 0;
3677 	struct intel_iommu *iommu = dmaru->iommu;
3678 
3679 	if (!intel_iommu_enabled)
3680 		return 0;
3681 	if (iommu == NULL)
3682 		return -EINVAL;
3683 
3684 	if (insert) {
3685 		ret = intel_iommu_add(dmaru);
3686 	} else {
3687 		disable_dmar_iommu(iommu);
3688 		free_dmar_iommu(iommu);
3689 	}
3690 
3691 	return ret;
3692 }
3693 
3694 static void intel_iommu_free_dmars(void)
3695 {
3696 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3697 	struct dmar_atsr_unit *atsru, *atsr_n;
3698 	struct dmar_satc_unit *satcu, *satc_n;
3699 
3700 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3701 		list_del(&rmrru->list);
3702 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3703 		kfree(rmrru);
3704 	}
3705 
3706 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3707 		list_del(&atsru->list);
3708 		intel_iommu_free_atsr(atsru);
3709 	}
3710 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3711 		list_del(&satcu->list);
3712 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3713 		kfree(satcu);
3714 	}
3715 }
3716 
3717 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3718 {
3719 	struct dmar_satc_unit *satcu;
3720 	struct acpi_dmar_satc *satc;
3721 	struct device *tmp;
3722 	int i;
3723 
3724 	dev = pci_physfn(dev);
3725 	rcu_read_lock();
3726 
3727 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3728 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3729 		if (satc->segment != pci_domain_nr(dev->bus))
3730 			continue;
3731 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3732 			if (to_pci_dev(tmp) == dev)
3733 				goto out;
3734 	}
3735 	satcu = NULL;
3736 out:
3737 	rcu_read_unlock();
3738 	return satcu;
3739 }
3740 
3741 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3742 {
3743 	int i, ret = 1;
3744 	struct pci_bus *bus;
3745 	struct pci_dev *bridge = NULL;
3746 	struct device *tmp;
3747 	struct acpi_dmar_atsr *atsr;
3748 	struct dmar_atsr_unit *atsru;
3749 	struct dmar_satc_unit *satcu;
3750 
3751 	dev = pci_physfn(dev);
3752 	satcu = dmar_find_matched_satc_unit(dev);
3753 	if (satcu)
3754 		/*
3755 		 * This device supports ATS as it is in the SATC table.
3756 		 * When the IOMMU is in legacy mode, enabling ATS is done
3757 		 * automatically by HW for a device that requires ATS,
3758 		 * hence the OS should not enable ATS for this device,
3759 		 * to avoid duplicated TLB invalidation.
3760 		 */
3761 		return !(satcu->atc_required && !sm_supported(iommu));
3762 
3763 	for (bus = dev->bus; bus; bus = bus->parent) {
3764 		bridge = bus->self;
3765 		/* If it's an integrated device, allow ATS */
3766 		if (!bridge)
3767 			return 1;
3768 		/* Connected via non-PCIe: no ATS */
3769 		if (!pci_is_pcie(bridge) ||
3770 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3771 			return 0;
3772 		/* If we found the root port, look it up in the ATSR */
3773 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3774 			break;
3775 	}
3776 
3777 	rcu_read_lock();
3778 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3779 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3780 		if (atsr->segment != pci_domain_nr(dev->bus))
3781 			continue;
3782 
3783 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3784 			if (tmp == &bridge->dev)
3785 				goto out;
3786 
3787 		if (atsru->include_all)
3788 			goto out;
3789 	}
3790 	ret = 0;
3791 out:
3792 	rcu_read_unlock();
3793 
3794 	return ret;
3795 }
3796 
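/*
 * PCI bus notification handler: keep the RMRR, ATSR and SATC device
 * scopes in sync as devices are added to or removed from the bus.
 */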
3797 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3798 {
3799 	int ret;
3800 	struct dmar_rmrr_unit *rmrru;
3801 	struct dmar_atsr_unit *atsru;
3802 	struct dmar_satc_unit *satcu;
3803 	struct acpi_dmar_atsr *atsr;
3804 	struct acpi_dmar_reserved_memory *rmrr;
3805 	struct acpi_dmar_satc *satc;
3806 
3807 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3808 		return 0;
3809 
3810 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3811 		rmrr = container_of(rmrru->hdr,
3812 				    struct acpi_dmar_reserved_memory, header);
3813 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3814 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3815 				((void *)rmrr) + rmrr->header.length,
3816 				rmrr->segment, rmrru->devices,
3817 				rmrru->devices_cnt);
3818 			if (ret < 0)
3819 				return ret;
3820 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3821 			dmar_remove_dev_scope(info, rmrr->segment,
3822 				rmrru->devices, rmrru->devices_cnt);
3823 		}
3824 	}
3825 
3826 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3827 		if (atsru->include_all)
3828 			continue;
3829 
3830 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3831 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3832 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3833 					(void *)atsr + atsr->header.length,
3834 					atsr->segment, atsru->devices,
3835 					atsru->devices_cnt);
3836 			if (ret > 0)
3837 				break;
3838 			else if (ret < 0)
3839 				return ret;
3840 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3841 			if (dmar_remove_dev_scope(info, atsr->segment,
3842 					atsru->devices, atsru->devices_cnt))
3843 				break;
3844 		}
3845 	}
3846 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3847 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3848 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3849 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3850 					(void *)satc + satc->header.length,
3851 					satc->segment, satcu->devices,
3852 					satcu->devices_cnt);
3853 			if (ret > 0)
3854 				break;
3855 			else if (ret < 0)
3856 				return ret;
3857 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3858 			if (dmar_remove_dev_scope(info, satc->segment,
3859 					satcu->devices, satcu->devices_cnt))
3860 				break;
3861 		}
3862 	}
3863 
3864 	return 0;
3865 }
3866 
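/*
 * Memory hotplug notifier: extend the static identity map (si_domain)
 * when memory goes online, and unmap/flush the range again when the
 * memory is taken offline.
 */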
3867 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3868 				       unsigned long val, void *v)
3869 {
3870 	struct memory_notify *mhp = v;
3871 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3872 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3873 			mhp->nr_pages - 1);
3874 
3875 	switch (val) {
3876 	case MEM_GOING_ONLINE:
3877 		if (iommu_domain_identity_map(si_domain,
3878 					      start_vpfn, last_vpfn)) {
3879 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3880 				start_vpfn, last_vpfn);
3881 			return NOTIFY_BAD;
3882 		}
3883 		break;
3884 
3885 	case MEM_OFFLINE:
3886 	case MEM_CANCEL_ONLINE:
3887 		{
3888 			struct dmar_drhd_unit *drhd;
3889 			struct intel_iommu *iommu;
3890 			LIST_HEAD(freelist);
3891 
3892 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3893 
3894 			rcu_read_lock();
3895 			for_each_active_iommu(iommu, drhd)
3896 				iommu_flush_iotlb_psi(iommu, si_domain,
3897 					start_vpfn, mhp->nr_pages,
3898 					list_empty(&freelist), 0);
3899 			rcu_read_unlock();
3900 			put_pages_list(&freelist);
3901 		}
3902 		break;
3903 	}
3904 
3905 	return NOTIFY_OK;
3906 }
3907 
3908 static struct notifier_block intel_iommu_memory_nb = {
3909 	.notifier_call = intel_iommu_memory_notifier,
3910 	.priority = 0
3911 };
3912 
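/* Switch off DMA translation on every IOMMU in the system. */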
3913 static void intel_disable_iommus(void)
3914 {
3915 	struct intel_iommu *iommu = NULL;
3916 	struct dmar_drhd_unit *drhd;
3917 
3918 	for_each_iommu(iommu, drhd)
3919 		iommu_disable_translation(iommu);
3920 }
3921 
3922 void intel_iommu_shutdown(void)
3923 {
3924 	struct dmar_drhd_unit *drhd;
3925 	struct intel_iommu *iommu = NULL;
3926 
3927 	if (no_iommu || dmar_disabled)
3928 		return;
3929 
3930 	down_write(&dmar_global_lock);
3931 
3932 	/* Disable PMRs explicitly here. */
3933 	for_each_iommu(iommu, drhd)
3934 		iommu_disable_protect_mem_regions(iommu);
3935 
3936 	/* Make sure the IOMMUs are switched off */
3937 	intel_disable_iommus();
3938 
3939 	up_write(&dmar_global_lock);
3940 }
3941 
3942 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3943 {
3944 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3945 
3946 	return container_of(iommu_dev, struct intel_iommu, iommu);
3947 }
3948 
3949 static ssize_t version_show(struct device *dev,
3950 			    struct device_attribute *attr, char *buf)
3951 {
3952 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3953 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3954 	return sprintf(buf, "%d:%d\n",
3955 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3956 }
3957 static DEVICE_ATTR_RO(version);
3958 
3959 static ssize_t address_show(struct device *dev,
3960 			    struct device_attribute *attr, char *buf)
3961 {
3962 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3963 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3964 }
3965 static DEVICE_ATTR_RO(address);
3966 
3967 static ssize_t cap_show(struct device *dev,
3968 			struct device_attribute *attr, char *buf)
3969 {
3970 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3971 	return sprintf(buf, "%llx\n", iommu->cap);
3972 }
3973 static DEVICE_ATTR_RO(cap);
3974 
3975 static ssize_t ecap_show(struct device *dev,
3976 			 struct device_attribute *attr, char *buf)
3977 {
3978 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3979 	return sprintf(buf, "%llx\n", iommu->ecap);
3980 }
3981 static DEVICE_ATTR_RO(ecap);
3982 
3983 static ssize_t domains_supported_show(struct device *dev,
3984 				      struct device_attribute *attr, char *buf)
3985 {
3986 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3987 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3988 }
3989 static DEVICE_ATTR_RO(domains_supported);
3990 
3991 static ssize_t domains_used_show(struct device *dev,
3992 				 struct device_attribute *attr, char *buf)
3993 {
3994 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3995 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3996 						  cap_ndoms(iommu->cap)));
3997 }
3998 static DEVICE_ATTR_RO(domains_used);
3999 
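/*
 * Per-IOMMU sysfs attributes. Together with iommu_device_sysfs_add()
 * in intel_iommu_init() these typically show up under
 * /sys/class/iommu/dmar<N>/intel-iommu/.
 */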
4000 static struct attribute *intel_iommu_attrs[] = {
4001 	&dev_attr_version.attr,
4002 	&dev_attr_address.attr,
4003 	&dev_attr_cap.attr,
4004 	&dev_attr_ecap.attr,
4005 	&dev_attr_domains_supported.attr,
4006 	&dev_attr_domains_used.attr,
4007 	NULL,
4008 };
4009 
4010 static struct attribute_group intel_iommu_group = {
4011 	.name = "intel-iommu",
4012 	.attrs = intel_iommu_attrs,
4013 };
4014 
4015 const struct attribute_group *intel_iommu_groups[] = {
4016 	&intel_iommu_group,
4017 	NULL,
4018 };
4019 
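/* Is any PCI device in the system marked as external facing? */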
4020 static inline bool has_external_pci(void)
4021 {
4022 	struct pci_dev *pdev = NULL;
4023 
4024 	for_each_pci_dev(pdev)
4025 		if (pdev->external_facing)
4026 			return true;
4027 
4028 	return false;
4029 }
4030 
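/*
 * Honour the platform's DMA protection opt-in (dmar_platform_optin()):
 * if the firmware requests it and an external-facing PCI port exists,
 * force the IOMMU on even if it was disabled on the command line.
 */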
4031 static int __init platform_optin_force_iommu(void)
4032 {
4033 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4034 		return 0;
4035 
4036 	if (no_iommu || dmar_disabled)
4037 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4038 
4039 	/*
4040 	 * If the Intel IOMMU is disabled by default, apply the identity
4041 	 * map for all devices except those marked as untrusted.
4042 	 */
4043 	if (dmar_disabled)
4044 		iommu_set_default_passthrough(false);
4045 
4046 	dmar_disabled = 0;
4047 	no_iommu = 0;
4048 
4049 	return 1;
4050 }
4051 
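/*
 * Walk the DRHD device scopes for ACPI namespace devices and probe
 * their physical-node companions so that they end up in IOMMU groups.
 */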
4052 static int __init probe_acpi_namespace_devices(void)
4053 {
4054 	struct dmar_drhd_unit *drhd;
4055 	/* To avoid a -Wunused-but-set-variable warning. */
4056 	struct intel_iommu *iommu __maybe_unused;
4057 	struct device *dev;
4058 	int i, ret = 0;
4059 
4060 	for_each_active_iommu(iommu, drhd) {
4061 		for_each_active_dev_scope(drhd->devices,
4062 					  drhd->devices_cnt, i, dev) {
4063 			struct acpi_device_physical_node *pn;
4064 			struct iommu_group *group;
4065 			struct acpi_device *adev;
4066 
4067 			if (dev->bus != &acpi_bus_type)
4068 				continue;
4069 
4070 			adev = to_acpi_device(dev);
4071 			mutex_lock(&adev->physical_node_lock);
4072 			list_for_each_entry(pn,
4073 					    &adev->physical_node_list, node) {
4074 				group = iommu_group_get(pn->dev);
4075 				if (group) {
4076 					iommu_group_put(group);
4077 					continue;
4078 				}
4079 
4080 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4081 				ret = iommu_probe_device(pn->dev);
4082 				if (ret)
4083 					break;
4084 			}
4085 			mutex_unlock(&adev->physical_node_lock);
4086 
4087 			if (ret)
4088 				return ret;
4089 		}
4090 	}
4091 
4092 	return 0;
4093 }
4094 
4095 int __init intel_iommu_init(void)
4096 {
4097 	int ret = -ENODEV;
4098 	struct dmar_drhd_unit *drhd;
4099 	struct intel_iommu *iommu;
4100 
4101 	/*
4102 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4103 	 * opt in, so enforce that.
4104 	 */
4105 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4106 		    platform_optin_force_iommu();
4107 
4108 	down_write(&dmar_global_lock);
4109 	if (dmar_table_init()) {
4110 		if (force_on)
4111 			panic("tboot: Failed to initialize DMAR table\n");
4112 		goto out_free_dmar;
4113 	}
4114 
4115 	if (dmar_dev_scope_init() < 0) {
4116 		if (force_on)
4117 			panic("tboot: Failed to initialize DMAR device scope\n");
4118 		goto out_free_dmar;
4119 	}
4120 
4121 	up_write(&dmar_global_lock);
4122 
4123 	/*
4124 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4125 	 * complain later when we register it under the lock.
4126 	 */
4127 	dmar_register_bus_notifier();
4128 
4129 	down_write(&dmar_global_lock);
4130 
4131 	if (!no_iommu)
4132 		intel_iommu_debugfs_init();
4133 
4134 	if (no_iommu || dmar_disabled) {
4135 		/*
4136 		 * We exit the function here to ensure that the IOMMU's
4137 		 * remapping and mempool aren't set up, which means the
4138 		 * IOMMU's PMRs won't be disabled via the call to
4139 		 * init_dmars(). So disable them explicitly here. The PMRs
4140 		 * were set up by tboot prior to calling SENTER, but the
4141 		 * kernel is expected to reset/tear down the PMRs.
4142 		 */
4143 		if (intel_iommu_tboot_noforce) {
4144 			for_each_iommu(iommu, drhd)
4145 				iommu_disable_protect_mem_regions(iommu);
4146 		}
4147 
4148 		/*
4149 		 * Make sure the IOMMUs are switched off, even when we
4150 		 * boot into a kexec kernel and the previous kernel left
4151 		 * them enabled.
4152 		 */
4153 		intel_disable_iommus();
4154 		goto out_free_dmar;
4155 	}
4156 
4157 	if (list_empty(&dmar_rmrr_units))
4158 		pr_info("No RMRR found\n");
4159 
4160 	if (list_empty(&dmar_atsr_units))
4161 		pr_info("No ATSR found\n");
4162 
4163 	if (list_empty(&dmar_satc_units))
4164 		pr_info("No SATC found\n");
4165 
4166 	if (dmar_map_gfx)
4167 		intel_iommu_gfx_mapped = 1;
4168 
4169 	init_no_remapping_devices();
4170 
4171 	ret = init_dmars();
4172 	if (ret) {
4173 		if (force_on)
4174 			panic("tboot: Failed to initialize DMARs\n");
4175 		pr_err("Initialization failed\n");
4176 		goto out_free_dmar;
4177 	}
4178 	up_write(&dmar_global_lock);
4179 
4180 	init_iommu_pm_ops();
4181 
4182 	down_read(&dmar_global_lock);
4183 	for_each_active_iommu(iommu, drhd) {
4184 		/*
4185 		 * The flush queue implementation does not perform
4186 		 * page-selective invalidations that are required for efficient
4187 		 * TLB flushes in virtual environments.  The benefit of batching
4188 		 * is likely to be much lower than the overhead of synchronizing
4189 		 * the virtual and physical IOMMU page-tables.
4190 		 */
4191 		if (cap_caching_mode(iommu->cap)) {
4192 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4193 			iommu_set_dma_strict();
4194 		}
4195 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4196 				       intel_iommu_groups,
4197 				       "%s", iommu->name);
4198 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4199 	}
4200 	up_read(&dmar_global_lock);
4201 
4202 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4203 	if (si_domain && !hw_pass_through)
4204 		register_memory_notifier(&intel_iommu_memory_nb);
4205 
4206 	down_read(&dmar_global_lock);
4207 	if (probe_acpi_namespace_devices())
4208 		pr_warn("ACPI name space devices didn't probe correctly\n");
4209 
4210 	/* Finally, we enable the DMA remapping hardware. */
4211 	for_each_iommu(iommu, drhd) {
4212 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4213 			iommu_enable_translation(iommu);
4214 
4215 		iommu_disable_protect_mem_regions(iommu);
4216 	}
4217 	up_read(&dmar_global_lock);
4218 
4219 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4220 
4221 	intel_iommu_enabled = 1;
4222 
4223 	return 0;
4224 
4225 out_free_dmar:
4226 	intel_iommu_free_dmars();
4227 	up_write(&dmar_global_lock);
4228 	return ret;
4229 }
4230 
4231 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4232 {
4233 	struct device_domain_info *info = opaque;
4234 
4235 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4236 	return 0;
4237 }
4238 
4239 /*
4240  * NB - intel-iommu lacks any sort of reference counting for the users of
4241  * dependent devices.  If multiple endpoints have intersecting dependent
4242  * devices, unbinding the driver from any one of them will possibly leave
4243  * the others unable to operate.
4244  */
4245 static void domain_context_clear(struct device_domain_info *info)
4246 {
4247 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4248 		return;
4249 
4250 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4251 			       &domain_context_clear_one_cb, info);
4252 }
4253 
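/*
 * Tear down everything associated with a device: PASID entry, device
 * IOTLB, context mapping and the domain reference. Caller must hold
 * device_domain_lock, as asserted below.
 */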
4254 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4255 {
4256 	struct dmar_domain *domain;
4257 	struct intel_iommu *iommu;
4258 	unsigned long flags;
4259 
4260 	assert_spin_locked(&device_domain_lock);
4261 
4262 	if (WARN_ON(!info))
4263 		return;
4264 
4265 	iommu = info->iommu;
4266 	domain = info->domain;
4267 
4268 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4269 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4270 			intel_pasid_tear_down_entry(iommu, info->dev,
4271 					PASID_RID2PASID, false);
4272 
4273 		iommu_disable_dev_iotlb(info);
4274 		domain_context_clear(info);
4275 		intel_pasid_free_table(info->dev);
4276 	}
4277 
4278 	list_del(&info->link);
4279 
4280 	spin_lock_irqsave(&iommu->lock, flags);
4281 	domain_detach_iommu(domain, iommu);
4282 	spin_unlock_irqrestore(&iommu->lock, flags);
4283 }
4284 
4285 static void dmar_remove_one_dev_info(struct device *dev)
4286 {
4287 	struct device_domain_info *info;
4288 	unsigned long flags;
4289 
4290 	spin_lock_irqsave(&device_domain_lock, flags);
4291 	info = dev_iommu_priv_get(dev);
4292 	if (info)
4293 		__dmar_remove_one_dev_info(info);
4294 	spin_unlock_irqrestore(&device_domain_lock, flags);
4295 }
4296 
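/*
 * Initialize a freshly allocated dmar_domain for the given guest
 * address width: compute the AGAW and allocate the top-level page
 * directory.
 */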
4297 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4298 {
4299 	int adjust_width;
4300 
4301 	/* calculate AGAW */
4302 	domain->gaw = guest_width;
4303 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4304 	domain->agaw = width_to_agaw(adjust_width);
4305 
4306 	domain->iommu_coherency = false;
4307 	domain->iommu_snooping = false;
4308 	domain->iommu_superpage = 0;
4309 	domain->max_addr = 0;
4310 
4311 	/* always allocate the top pgd */
4312 	domain->pgd = alloc_pgtable_page(domain->nid);
4313 	if (!domain->pgd)
4314 		return -ENOMEM;
4315 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4316 	return 0;
4317 }
4318 
4319 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4320 {
4321 	struct dmar_domain *dmar_domain;
4322 	struct iommu_domain *domain;
4323 
4324 	switch (type) {
4325 	case IOMMU_DOMAIN_DMA:
4326 	case IOMMU_DOMAIN_DMA_FQ:
4327 	case IOMMU_DOMAIN_UNMANAGED:
4328 		dmar_domain = alloc_domain(type);
4329 		if (!dmar_domain) {
4330 			pr_err("Can't allocate dmar_domain\n");
4331 			return NULL;
4332 		}
4333 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4334 			pr_err("Domain initialization failed\n");
4335 			domain_exit(dmar_domain);
4336 			return NULL;
4337 		}
4338 
4339 		domain = &dmar_domain->domain;
4340 		domain->geometry.aperture_start = 0;
4341 		domain->geometry.aperture_end   =
4342 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4343 		domain->geometry.force_aperture = true;
4344 
4345 		return domain;
4346 	case IOMMU_DOMAIN_IDENTITY:
4347 		return &si_domain->domain;
4348 	default:
4349 		return NULL;
4350 	}
4351 
4352 	return NULL;
4353 }
4354 
4355 static void intel_iommu_domain_free(struct iommu_domain *domain)
4356 {
4357 	if (domain != &si_domain->domain)
4358 		domain_exit(to_dmar_domain(domain));
4359 }
4360 
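/*
 * Check that the IOMMU serving @dev can address everything already
 * mapped in the domain, and drop page-table levels if the domain has
 * more of them than the IOMMU's AGAW supports.
 */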
4361 static int prepare_domain_attach_device(struct iommu_domain *domain,
4362 					struct device *dev)
4363 {
4364 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4365 	struct intel_iommu *iommu;
4366 	int addr_width;
4367 
4368 	iommu = device_to_iommu(dev, NULL, NULL);
4369 	if (!iommu)
4370 		return -ENODEV;
4371 
4372 	/* check if this iommu agaw is sufficient for max mapped address */
4373 	addr_width = agaw_to_width(iommu->agaw);
4374 	if (addr_width > cap_mgaw(iommu->cap))
4375 		addr_width = cap_mgaw(iommu->cap);
4376 
4377 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4378 		dev_err(dev,
4379 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4380 			__func__, addr_width, dmar_domain->max_addr);
4381 		return -EFAULT;
4382 	}
4383 	dmar_domain->gaw = addr_width;
4384 
4385 	/*
4386 	 * Knock out extra levels of page tables if necessary.
4387 	 */
4388 	while (iommu->agaw < dmar_domain->agaw) {
4389 		struct dma_pte *pte;
4390 
4391 		pte = dmar_domain->pgd;
4392 		if (dma_pte_present(pte)) {
4393 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4394 			free_pgtable_page(pte);
4395 		}
4396 		dmar_domain->agaw--;
4397 	}
4398 
4399 	return 0;
4400 }
4401 
4402 static int intel_iommu_attach_device(struct iommu_domain *domain,
4403 				     struct device *dev)
4404 {
4405 	int ret;
4406 
4407 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4408 	    device_is_rmrr_locked(dev)) {
4409 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4410 		return -EPERM;
4411 	}
4412 
4413 	/* normally dev is not mapped */
4414 	if (unlikely(domain_context_mapped(dev))) {
4415 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4416 
4417 		if (info->domain)
4418 			dmar_remove_one_dev_info(dev);
4419 	}
4420 
4421 	ret = prepare_domain_attach_device(domain, dev);
4422 	if (ret)
4423 		return ret;
4424 
4425 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4426 }
4427 
4428 static void intel_iommu_detach_device(struct iommu_domain *domain,
4429 				      struct device *dev)
4430 {
4431 	dmar_remove_one_dev_info(dev);
4432 }
4433 
4434 static int intel_iommu_map(struct iommu_domain *domain,
4435 			   unsigned long iova, phys_addr_t hpa,
4436 			   size_t size, int iommu_prot, gfp_t gfp)
4437 {
4438 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4439 	u64 max_addr;
4440 	int prot = 0;
4441 
4442 	if (iommu_prot & IOMMU_READ)
4443 		prot |= DMA_PTE_READ;
4444 	if (iommu_prot & IOMMU_WRITE)
4445 		prot |= DMA_PTE_WRITE;
4446 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4447 		prot |= DMA_PTE_SNP;
4448 
4449 	max_addr = iova + size;
4450 	if (dmar_domain->max_addr < max_addr) {
4451 		u64 end;
4452 
4453 		/* check if minimum agaw is sufficient for mapped address */
4454 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4455 		if (end < max_addr) {
4456 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4457 			       __func__, dmar_domain->gaw,
4458 			       max_addr);
4459 			return -EFAULT;
4460 		}
4461 		dmar_domain->max_addr = max_addr;
4462 	}
4463 	/* Round up size to next multiple of PAGE_SIZE, if it and
4464 	   the low bits of hpa would take us onto the next page */
4465 	size = aligned_nrpages(hpa, size);
4466 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4467 				hpa >> VTD_PAGE_SHIFT, size, prot);
4468 }
4469 
4470 static int intel_iommu_map_pages(struct iommu_domain *domain,
4471 				 unsigned long iova, phys_addr_t paddr,
4472 				 size_t pgsize, size_t pgcount,
4473 				 int prot, gfp_t gfp, size_t *mapped)
4474 {
4475 	unsigned long pgshift = __ffs(pgsize);
4476 	size_t size = pgcount << pgshift;
4477 	int ret;
4478 
4479 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4480 		return -EINVAL;
4481 
4482 	if (!IS_ALIGNED(iova | paddr, pgsize))
4483 		return -EINVAL;
4484 
4485 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4486 	if (!ret && mapped)
4487 		*mapped = size;
4488 
4489 	return ret;
4490 }
4491 
4492 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4493 				unsigned long iova, size_t size,
4494 				struct iommu_iotlb_gather *gather)
4495 {
4496 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4497 	unsigned long start_pfn, last_pfn;
4498 	int level = 0;
4499 
4500 	/* Cope with horrid API which requires us to unmap more than the
4501 	   size argument if it happens to be a large-page mapping. */
4502 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4503 
4504 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4505 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4506 
4507 	start_pfn = iova >> VTD_PAGE_SHIFT;
4508 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4509 
4510 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4511 
4512 	if (dmar_domain->max_addr == iova + size)
4513 		dmar_domain->max_addr = iova;
4514 
4515 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4516 
4517 	return size;
4518 }
4519 
4520 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4521 				      unsigned long iova,
4522 				      size_t pgsize, size_t pgcount,
4523 				      struct iommu_iotlb_gather *gather)
4524 {
4525 	unsigned long pgshift = __ffs(pgsize);
4526 	size_t size = pgcount << pgshift;
4527 
4528 	return intel_iommu_unmap(domain, iova, size, gather);
4529 }
4530 
4531 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4532 				 struct iommu_iotlb_gather *gather)
4533 {
4534 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4535 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4536 	size_t size = gather->end - gather->start;
4537 	unsigned long start_pfn;
4538 	unsigned long nrpages;
4539 	int iommu_id;
4540 
4541 	nrpages = aligned_nrpages(gather->start, size);
4542 	start_pfn = mm_to_dma_pfn(iova_pfn);
4543 
4544 	for_each_domain_iommu(iommu_id, dmar_domain)
4545 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4546 				      start_pfn, nrpages,
4547 				      list_empty(&gather->freelist), 0);
4548 
4549 	put_pages_list(&gather->freelist);
4550 }
4551 
4552 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4553 					    dma_addr_t iova)
4554 {
4555 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4556 	struct dma_pte *pte;
4557 	int level = 0;
4558 	u64 phys = 0;
4559 
4560 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4561 	if (pte && dma_pte_present(pte))
4562 		phys = dma_pte_addr(pte) +
4563 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4564 						VTD_PAGE_SHIFT) - 1));
4565 
4566 	return phys;
4567 }
4568 
4569 static bool intel_iommu_capable(enum iommu_cap cap)
4570 {
4571 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4572 		return domain_update_iommu_snooping(NULL);
4573 	if (cap == IOMMU_CAP_INTR_REMAP)
4574 		return irq_remapping_enabled == 1;
4575 
4576 	return false;
4577 }
4578 
4579 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4580 {
4581 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4582 	struct device_domain_info *info;
4583 	struct intel_iommu *iommu;
4584 	unsigned long flags;
4585 	u8 bus, devfn;
4586 
4587 	iommu = device_to_iommu(dev, &bus, &devfn);
4588 	if (!iommu)
4589 		return ERR_PTR(-ENODEV);
4590 
4591 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4592 	if (!info)
4593 		return ERR_PTR(-ENOMEM);
4594 
4595 	if (dev_is_real_dma_subdevice(dev)) {
4596 		info->bus = pdev->bus->number;
4597 		info->devfn = pdev->devfn;
4598 		info->segment = pci_domain_nr(pdev->bus);
4599 	} else {
4600 		info->bus = bus;
4601 		info->devfn = devfn;
4602 		info->segment = iommu->segment;
4603 	}
4604 
4605 	info->dev = dev;
4606 	info->iommu = iommu;
4607 	if (dev_is_pci(dev)) {
4608 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4609 		    pci_ats_supported(pdev) &&
4610 		    dmar_ats_supported(pdev, iommu))
4611 			info->ats_supported = 1;
4612 
4613 		if (sm_supported(iommu)) {
4614 			if (pasid_supported(iommu)) {
4615 				int features = pci_pasid_features(pdev);
4616 
4617 				if (features >= 0)
4618 					info->pasid_supported = features | 1;
4619 			}
4620 
4621 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4622 			    pci_pri_supported(pdev))
4623 				info->pri_supported = 1;
4624 		}
4625 	}
4626 
4627 	spin_lock_irqsave(&device_domain_lock, flags);
4628 	list_add(&info->global, &device_domain_list);
4629 	dev_iommu_priv_set(dev, info);
4630 	spin_unlock_irqrestore(&device_domain_lock, flags);
4631 
4632 	return &iommu->iommu;
4633 }
4634 
4635 static void intel_iommu_release_device(struct device *dev)
4636 {
4637 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4638 	unsigned long flags;
4639 
4640 	dmar_remove_one_dev_info(dev);
4641 
4642 	spin_lock_irqsave(&device_domain_lock, flags);
4643 	dev_iommu_priv_set(dev, NULL);
4644 	list_del(&info->global);
4645 	spin_unlock_irqrestore(&device_domain_lock, flags);
4646 
4647 	kfree(info);
4648 	set_dma_ops(dev, NULL);
4649 }
4650 
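/* Clear any stale DMA ops and let the common IOMMU-DMA code install its own. */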
4651 static void intel_iommu_probe_finalize(struct device *dev)
4652 {
4653 	set_dma_ops(dev, NULL);
4654 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4655 }
4656 
4657 static void intel_iommu_get_resv_regions(struct device *device,
4658 					 struct list_head *head)
4659 {
4660 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4661 	struct iommu_resv_region *reg;
4662 	struct dmar_rmrr_unit *rmrr;
4663 	struct device *i_dev;
4664 	int i;
4665 
4666 	down_read(&dmar_global_lock);
4667 	for_each_rmrr_units(rmrr) {
4668 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4669 					  i, i_dev) {
4670 			struct iommu_resv_region *resv;
4671 			enum iommu_resv_type type;
4672 			size_t length;
4673 
4674 			if (i_dev != device &&
4675 			    !is_downstream_to_pci_bridge(device, i_dev))
4676 				continue;
4677 
4678 			length = rmrr->end_address - rmrr->base_address + 1;
4679 
4680 			type = device_rmrr_is_relaxable(device) ?
4681 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4682 
4683 			resv = iommu_alloc_resv_region(rmrr->base_address,
4684 						       length, prot, type);
4685 			if (!resv)
4686 				break;
4687 
4688 			list_add_tail(&resv->list, head);
4689 		}
4690 	}
4691 	up_read(&dmar_global_lock);
4692 
4693 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4694 	if (dev_is_pci(device)) {
4695 		struct pci_dev *pdev = to_pci_dev(device);
4696 
4697 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4698 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4699 						   IOMMU_RESV_DIRECT_RELAXABLE);
4700 			if (reg)
4701 				list_add_tail(&reg->list, head);
4702 		}
4703 	}
4704 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4705 
4706 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4707 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4708 				      0, IOMMU_RESV_MSI);
4709 	if (!reg)
4710 		return;
4711 	list_add_tail(&reg->list, head);
4712 }
4713 
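/*
 * Enable PASID support for @dev: set the PASID-enable bit in its
 * context entry, invalidate the context cache, and enable the device's
 * IOTLB if it was not enabled yet.
 */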
4714 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4715 {
4716 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4717 	struct context_entry *context;
4718 	struct dmar_domain *domain;
4719 	unsigned long flags;
4720 	u64 ctx_lo;
4721 	int ret;
4722 
4723 	domain = info->domain;
4724 	if (!domain)
4725 		return -EINVAL;
4726 
4727 	spin_lock_irqsave(&device_domain_lock, flags);
4728 	spin_lock(&iommu->lock);
4729 
4730 	ret = -EINVAL;
4731 	if (!info->pasid_supported)
4732 		goto out;
4733 
4734 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4735 	if (WARN_ON(!context))
4736 		goto out;
4737 
4738 	ctx_lo = context[0].lo;
4739 
4740 	if (!(ctx_lo & CONTEXT_PASIDE)) {
4741 		ctx_lo |= CONTEXT_PASIDE;
4742 		context[0].lo = ctx_lo;
4743 		wmb();
4744 		iommu->flush.flush_context(iommu,
4745 					   domain->iommu_did[iommu->seq_id],
4746 					   PCI_DEVID(info->bus, info->devfn),
4747 					   DMA_CCMD_MASK_NOBIT,
4748 					   DMA_CCMD_DEVICE_INVL);
4749 	}
4750 
4751 	/* Enable PASID support in the device, if it wasn't already */
4752 	if (!info->pasid_enabled)
4753 		iommu_enable_dev_iotlb(info);
4754 
4755 	ret = 0;
4756 
4757  out:
4758 	spin_unlock(&iommu->lock);
4759 	spin_unlock_irqrestore(&device_domain_lock, flags);
4760 
4761 	return ret;
4762 }
4763 
4764 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4765 {
4766 	if (dev_is_pci(dev))
4767 		return pci_device_group(dev);
4768 	return generic_device_group(dev);
4769 }
4770 
4771 static int intel_iommu_enable_sva(struct device *dev)
4772 {
4773 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4774 	struct intel_iommu *iommu;
4775 	int ret;
4776 
4777 	if (!info || dmar_disabled)
4778 		return -EINVAL;
4779 
4780 	iommu = info->iommu;
4781 	if (!iommu)
4782 		return -EINVAL;
4783 
4784 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4785 		return -ENODEV;
4786 
4787 	if (intel_iommu_enable_pasid(iommu, dev))
4788 		return -ENODEV;
4789 
4790 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4791 		return -EINVAL;
4792 
4793 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4794 	if (!ret)
4795 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4796 
4797 	return ret;
4798 }
4799 
4800 static int intel_iommu_disable_sva(struct device *dev)
4801 {
4802 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4803 	struct intel_iommu *iommu = info->iommu;
4804 	int ret;
4805 
4806 	ret = iommu_unregister_device_fault_handler(dev);
4807 	if (!ret)
4808 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4809 
4810 	return ret;
4811 }
4812 
4813 static int intel_iommu_enable_iopf(struct device *dev)
4814 {
4815 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4816 
4817 	if (info && info->pri_supported)
4818 		return 0;
4819 
4820 	return -ENODEV;
4821 }
4822 
4823 static int
4824 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4825 {
4826 	switch (feat) {
4827 	case IOMMU_DEV_FEAT_IOPF:
4828 		return intel_iommu_enable_iopf(dev);
4829 
4830 	case IOMMU_DEV_FEAT_SVA:
4831 		return intel_iommu_enable_sva(dev);
4832 
4833 	default:
4834 		return -ENODEV;
4835 	}
4836 }
4837 
4838 static int
4839 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4840 {
4841 	switch (feat) {
4842 	case IOMMU_DEV_FEAT_IOPF:
4843 		return 0;
4844 
4845 	case IOMMU_DEV_FEAT_SVA:
4846 		return intel_iommu_disable_sva(dev);
4847 
4848 	default:
4849 		return -ENODEV;
4850 	}
4851 }
4852 
4853 static bool intel_iommu_is_attach_deferred(struct device *dev)
4854 {
4855 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4856 
4857 	return translation_pre_enabled(info->iommu) && !info->domain;
4858 }
4859 
4860 /*
4861  * Check that the device does not live on an external facing PCI port that is
4862  * marked as untrusted. Such devices should not be able to apply quirks and
4863  * thus not be able to bypass the IOMMU restrictions.
4864  */
4865 static bool risky_device(struct pci_dev *pdev)
4866 {
4867 	if (pdev->untrusted) {
4868 		pci_info(pdev,
4869 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4870 			 pdev->vendor, pdev->device);
4871 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4872 		return true;
4873 	}
4874 	return false;
4875 }
4876 
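/*
 * Propagate freshly created mappings to every IOMMU backing the domain
 * via __mapping_notify_one().
 */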
4877 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4878 				       unsigned long iova, size_t size)
4879 {
4880 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4881 	unsigned long pages = aligned_nrpages(iova, size);
4882 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4883 	struct intel_iommu *iommu;
4884 	int iommu_id;
4885 
4886 	for_each_domain_iommu(iommu_id, dmar_domain) {
4887 		iommu = g_iommus[iommu_id];
4888 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
4889 	}
4890 }
4891 
4892 const struct iommu_ops intel_iommu_ops = {
4893 	.capable		= intel_iommu_capable,
4894 	.domain_alloc		= intel_iommu_domain_alloc,
4895 	.probe_device		= intel_iommu_probe_device,
4896 	.probe_finalize		= intel_iommu_probe_finalize,
4897 	.release_device		= intel_iommu_release_device,
4898 	.get_resv_regions	= intel_iommu_get_resv_regions,
4899 	.put_resv_regions	= generic_iommu_put_resv_regions,
4900 	.device_group		= intel_iommu_device_group,
4901 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4902 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4903 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4904 	.def_domain_type	= device_def_domain_type,
4905 	.pgsize_bitmap		= SZ_4K,
4906 #ifdef CONFIG_INTEL_IOMMU_SVM
4907 	.sva_bind		= intel_svm_bind,
4908 	.sva_unbind		= intel_svm_unbind,
4909 	.sva_get_pasid		= intel_svm_get_pasid,
4910 	.page_response		= intel_svm_page_response,
4911 #endif
4912 	.default_domain_ops = &(const struct iommu_domain_ops) {
4913 		.attach_dev		= intel_iommu_attach_device,
4914 		.detach_dev		= intel_iommu_detach_device,
4915 		.map_pages		= intel_iommu_map_pages,
4916 		.unmap_pages		= intel_iommu_unmap_pages,
4917 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4918 		.flush_iotlb_all        = intel_flush_iotlb_all,
4919 		.iotlb_sync		= intel_iommu_tlb_sync,
4920 		.iova_to_phys		= intel_iommu_iova_to_phys,
4921 		.free			= intel_iommu_domain_free,
4922 	}
4923 };
4924 
4925 static void quirk_iommu_igfx(struct pci_dev *dev)
4926 {
4927 	if (risky_device(dev))
4928 		return;
4929 
4930 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4931 	dmar_map_gfx = 0;
4932 }
4933 
4934 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4942 
4943 /* Broadwell igfx malfunctions with dmar */
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4968 
4969 static void quirk_iommu_rwbf(struct pci_dev *dev)
4970 {
4971 	if (risky_device(dev))
4972 		return;
4973 
4974 	/*
4975 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4976 	 * but needs it. Same seems to hold for the desktop versions.
4977 	 */
4978 	pci_info(dev, "Forcing write-buffer flush capability\n");
4979 	rwbf_quirk = 1;
4980 }
4981 
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4989 
4990 #define GGC 0x52
4991 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4992 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4993 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4994 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4995 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4996 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4997 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4998 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4999 
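/*
 * Calpella/Ironlake graphics: if the BIOS allocated no shadow GTT space
 * (GGC_MEMORY_VT_ENABLED clear), graphics DMA remapping cannot work, so
 * disable it; otherwise fall back to strict IOTLB flushing.
 */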
5000 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5001 {
5002 	unsigned short ggc;
5003 
5004 	if (risky_device(dev))
5005 		return;
5006 
5007 	if (pci_read_config_word(dev, GGC, &ggc))
5008 		return;
5009 
5010 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5011 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5012 		dmar_map_gfx = 0;
5013 	} else if (dmar_map_gfx) {
5014 		/* we have to ensure the gfx device is idle before we flush */
5015 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5016 		iommu_set_dma_strict();
5017 	}
5018 }
5019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5023 
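/*
 * For the integrated graphics generations matched below (by the high
 * byte of the PCI device ID), set iommu_skip_te_disable so that the
 * translation-enable bit is left alone when the IOMMU is torn down.
 */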
5024 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5025 {
5026 	unsigned short ver;
5027 
5028 	if (!IS_GFX_DEVICE(dev))
5029 		return;
5030 
5031 	ver = (dev->device >> 8) & 0xff;
5032 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5033 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5034 	    ver != 0x9a)
5035 		return;
5036 
5037 	if (risky_device(dev))
5038 		return;
5039 
5040 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5041 	iommu_skip_te_disable = 1;
5042 }
5043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5044 
5045 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5046    ISOCH DMAR unit for the Azalia sound device, but not give it any
5047    TLB entries, which causes it to deadlock. Check for that.  We do
5048    this in a function called from init_dmars(), instead of in a PCI
5049    quirk, because we don't want to print the obnoxious "BIOS broken"
5050    message if VT-d is actually disabled.
5051 */
5052 static void __init check_tylersburg_isoch(void)
5053 {
5054 	struct pci_dev *pdev;
5055 	uint32_t vtisochctrl;
5056 
5057 	/* If there's no Azalia in the system anyway, forget it. */
5058 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5059 	if (!pdev)
5060 		return;
5061 
5062 	if (risky_device(pdev)) {
5063 		pci_dev_put(pdev);
5064 		return;
5065 	}
5066 
5067 	pci_dev_put(pdev);
5068 
5069 	/* System Management Registers. Might be hidden, in which case
5070 	   we can't do the sanity check. But that's OK, because the
5071 	   known-broken BIOSes _don't_ actually hide it, so far. */
5072 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5073 	if (!pdev)
5074 		return;
5075 
5076 	if (risky_device(pdev)) {
5077 		pci_dev_put(pdev);
5078 		return;
5079 	}
5080 
5081 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5082 		pci_dev_put(pdev);
5083 		return;
5084 	}
5085 
5086 	pci_dev_put(pdev);
5087 
5088 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5089 	if (vtisochctrl & 1)
5090 		return;
5091 
5092 	/* Drop all bits other than the number of TLB entries */
5093 	vtisochctrl &= 0x1c;
5094 
5095 	/* If we have the recommended number of TLB entries (16), fine. */
5096 	if (vtisochctrl == 0x10)
5097 		return;
5098 
5099 	/* Zero TLB entries? You get to ride the short bus to school. */
5100 	if (!vtisochctrl) {
5101 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5102 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5103 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5104 		     dmi_get_system_info(DMI_BIOS_VERSION),
5105 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5106 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5107 		return;
5108 	}
5109 
5110 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5111 	       vtisochctrl);
5112 }
5113