xref: /linux/drivers/iommu/intel/iommu.c (revision 3d7dc8658105f0408f53f5df13f5f2b4610bb4ca)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
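/*
 * Worked example (illustration only; gaw == 48 and a 64-bit kernel are
 * assumed): with VTD_PAGE_SHIFT == 12, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1, which fits in an unsigned long, so DOMAIN_MAX_PFN(48)
 * keeps that value rather than clamping to ULONG_MAX. DOMAIN_MAX_ADDR(48)
 * is then ((1ULL << 36) - 1) << 12 == 0xfffffffff000, the last
 * 4KiB-aligned address such a domain can map.
 */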
56 
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59 
60 /*
61  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
62  * (used when the kernel is launched with TXT).
63  */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67 
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69 
70 /*
71  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72  * if marked present.
73  */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 	if (!(re->lo & 1))
77 		return 0;
78 
79 	return re->lo & VTD_PAGE_MASK;
80 }
81 
82 /*
83  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84  * if marked present.
85  */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 	if (!(re->hi & 1))
89 		return 0;
90 
91 	return re->hi & VTD_PAGE_MASK;
92 }
93 
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 	struct device_domain_info *info =
97 		rb_entry(node, struct device_domain_info, node);
98 	const u16 *rid_lhs = key;
99 
100 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 		return -1;
102 
103 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 		return 1;
105 
106 	return 0;
107 }
108 
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 	struct device_domain_info *info =
112 		rb_entry(lhs, struct device_domain_info, node);
113 	u16 key = PCI_DEVID(info->bus, info->devfn);
114 
115 	return device_rid_cmp_key(&key, rhs);
116 }
117 
118 /*
119  * Looks up an IOMMU-probed device using its source ID.
120  *
121  * Returns the pointer to the device if there is a match. Otherwise,
122  * returns NULL.
123  *
124  * Note that this helper doesn't guarantee that the device won't be
125  * released by the iommu subsystem after being returned. The caller
126  * should use its own synchronization mechanism to avoid the device
127  * being released while it is in use, if that is a possibility.
128  */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 	struct device_domain_info *info = NULL;
132 	struct rb_node *node;
133 	unsigned long flags;
134 
135 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 	if (node)
138 		info = rb_entry(node, struct device_domain_info, node);
139 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140 
141 	return info ? info->dev : NULL;
142 }
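/*
 * Illustrative caller (a sketch, not part of this file; the surrounding
 * fault-handling context is assumed): code holding a source ID, e.g. from
 * a fault record, could resolve it to the probed device like this,
 * provided it supplies its own guarantee that the device is not released
 * while being used:
 *
 *	struct device *dev = device_rbtree_find(iommu, rid);
 *
 *	if (dev)
 *		dev_err(dev, "unrecoverable DMA fault for this device\n");
 */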
143 
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 				struct device_domain_info *info)
146 {
147 	struct rb_node *curr;
148 	unsigned long flags;
149 
150 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 	if (WARN_ON(curr))
154 		return -EEXIST;
155 
156 	return 0;
157 }
158 
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 	struct intel_iommu *iommu = info->iommu;
162 	unsigned long flags;
163 
164 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 	rb_erase(&info->node, &iommu->device_rbtree);
166 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168 
169 struct dmar_rmrr_unit {
170 	struct list_head list;		/* list of rmrr units	*/
171 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
172 	u64	base_address;		/* reserved base address*/
173 	u64	end_address;		/* reserved end address */
174 	struct dmar_dev_scope *devices;	/* target devices */
175 	int	devices_cnt;		/* target device count */
176 };
177 
178 struct dmar_atsr_unit {
179 	struct list_head list;		/* list of ATSR units */
180 	struct acpi_dmar_header *hdr;	/* ACPI header */
181 	struct dmar_dev_scope *devices;	/* target devices */
182 	int devices_cnt;		/* target device count */
183 	u8 include_all:1;		/* include all ports */
184 };
185 
186 struct dmar_satc_unit {
187 	struct list_head list;		/* list of SATC units */
188 	struct acpi_dmar_header *hdr;	/* ACPI header */
189 	struct dmar_dev_scope *devices;	/* target devices */
190 	struct intel_iommu *iommu;	/* the corresponding iommu */
191 	int devices_cnt;		/* target device count */
192 	u8 atc_required:1;		/* ATS is required */
193 };
194 
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198 
199 #define for_each_rmrr_units(rmrr) \
200 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201 
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203 
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206 
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209 
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214 
215 #define IDENTMAP_AZALIA		4
216 
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219 
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224 
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229 
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 	u32 gsts;
233 
234 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 	if (gsts & DMA_GSTS_TES)
236 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238 
239 static int __init intel_iommu_setup(char *str)
240 {
241 	if (!str)
242 		return -EINVAL;
243 
244 	while (*str) {
245 		if (!strncmp(str, "on", 2)) {
246 			dmar_disabled = 0;
247 			pr_info("IOMMU enabled\n");
248 		} else if (!strncmp(str, "off", 3)) {
249 			dmar_disabled = 1;
250 			no_platform_optin = 1;
251 			pr_info("IOMMU disabled\n");
252 		} else if (!strncmp(str, "igfx_off", 8)) {
253 			disable_igfx_iommu = 1;
254 			pr_info("Disable GFX device mapping\n");
255 		} else if (!strncmp(str, "forcedac", 8)) {
256 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 			iommu_dma_forcedac = true;
258 		} else if (!strncmp(str, "strict", 6)) {
259 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 			iommu_set_dma_strict();
261 		} else if (!strncmp(str, "sp_off", 6)) {
262 			pr_info("Disable supported super page\n");
263 			intel_iommu_superpage = 0;
264 		} else if (!strncmp(str, "sm_on", 5)) {
265 			pr_info("Enable scalable mode if hardware supports\n");
266 			intel_iommu_sm = 1;
267 		} else if (!strncmp(str, "sm_off", 6)) {
268 			pr_info("Scalable mode is disallowed\n");
269 			intel_iommu_sm = 0;
270 		} else if (!strncmp(str, "tboot_noforce", 13)) {
271 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 			intel_iommu_tboot_noforce = 1;
273 		} else {
274 			pr_notice("Unknown option - '%s'\n", str);
275 		}
276 
277 		str += strcspn(str, ",");
278 		while (*str == ',')
279 			str++;
280 	}
281 
282 	return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285 
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289 
290 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
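/*
 * Example (illustration; a domain with a 48-bit address width is assumed):
 * addr_width is 48 - 12 = 36, so pfn 0xfffffffff (the last 4KiB page below
 * 2^48) is accepted, while any pfn with bit 36 or above set is rejected.
 */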
292 
293 /*
294  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296  * the returned SAGAW.
297  */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 	unsigned long fl_sagaw, sl_sagaw;
301 
302 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 	sl_sagaw = cap_sagaw(iommu->cap);
304 
305 	/* Second level only. */
306 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 		return sl_sagaw;
308 
309 	/* First level only. */
310 	if (!ecap_slts(iommu->ecap))
311 		return fl_sagaw;
312 
313 	return fl_sagaw & sl_sagaw;
314 }
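/*
 * Worked example (a sketch; the capability values are assumed): in the
 * SAGAW encoding referenced above, BIT(2) means 4-level (48-bit) and
 * BIT(3) means 5-level (57-bit) paging. On a scalable-mode IOMMU with both
 * ecap_flts() and ecap_slts() set, cap_fl5lp_support() clear and
 * cap_sagaw() == BIT(2), fl_sagaw and sl_sagaw are both BIT(2), so only
 * the 4-level width common to both table formats is reported.
 */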
315 
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 	unsigned long sagaw;
319 	int agaw;
320 
321 	sagaw = __iommu_calculate_sagaw(iommu);
322 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 		if (test_bit(agaw, &sagaw))
324 			break;
325 	}
326 
327 	return agaw;
328 }
329 
330 /*
331  * Calculate max SAGAW for each iommu.
332  */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337 
338 /*
339  * Calculate the agaw for each iommu.
340  * "SAGAW" may be different across iommus; use a default agaw, and fall
341  * back to a smaller supported agaw for iommus that don't support the default.
342  */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347 
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 	return sm_supported(iommu) ?
351 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353 
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 	unsigned long bitmap = 0;
358 
359 	/*
360 	 * 1-level super page supports page size of 2MiB, 2-level super page
361 	 * supports page size of both 2MiB and 1GiB.
362 	 */
363 	if (domain->iommu_superpage == 1)
364 		bitmap |= SZ_2M;
365 	else if (domain->iommu_superpage == 2)
366 		bitmap |= SZ_2M | SZ_1G;
367 
368 	return bitmap;
369 }
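/*
 * Example (illustration only): for iommu_superpage == 2 the returned
 * bitmap is SZ_2M | SZ_1G == 0x200000 | 0x40000000 == 0x40200000, i.e.
 * both 2MiB and 1GiB mappings may be used in addition to 4KiB pages.
 */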
370 
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 					 u8 devfn, int alloc)
373 {
374 	struct root_entry *root = &iommu->root_entry[bus];
375 	struct context_entry *context;
376 	u64 *entry;
377 
378 	/*
379 	 * Unless the caller requested to allocate a new entry,
380 	 * returning a copied context entry makes no sense.
381 	 */
382 	if (!alloc && context_copied(iommu, bus, devfn))
383 		return NULL;
384 
385 	entry = &root->lo;
386 	if (sm_supported(iommu)) {
387 		if (devfn >= 0x80) {
388 			devfn -= 0x80;
389 			entry = &root->hi;
390 		}
391 		devfn *= 2;
392 	}
393 	if (*entry & 1)
394 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 	else {
396 		unsigned long phy_addr;
397 		if (!alloc)
398 			return NULL;
399 
400 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 		if (!context)
402 			return NULL;
403 
404 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 		phy_addr = virt_to_phys((void *)context);
406 		*entry = phy_addr | 1;
407 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
408 	}
409 	return &context[devfn];
410 }
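/*
 * Worked example (a sketch; the devfn value is assumed): in scalable mode
 * the lower context table (root->lo) serves devfn 0x00-0x7f and the upper
 * table (root->hi) serves devfn 0x80-0xff, with two context entries per
 * function. For devfn 0x85 the code therefore selects root->hi, rebases
 * devfn to 0x05 and returns &context[0x0a]. In legacy mode devfn indexes
 * the single 256-entry table directly.
 */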
411 
412 /**
413  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414  *				 sub-hierarchy of a candidate PCI-PCI bridge
415  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416  * @bridge: the candidate PCI-PCI bridge
417  *
418  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419  */
420 static bool
421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 	struct pci_dev *pdev, *pbridge;
424 
425 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 		return false;
427 
428 	pdev = to_pci_dev(dev);
429 	pbridge = to_pci_dev(bridge);
430 
431 	if (pbridge->subordinate &&
432 	    pbridge->subordinate->number <= pdev->bus->number &&
433 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 		return true;
435 
436 	return false;
437 }
438 
439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 	struct dmar_drhd_unit *drhd;
442 	u32 vtbar;
443 	int rc;
444 
445 	/* We know that this device on this chipset has its own IOMMU.
446 	 * If we find it under a different IOMMU, then the BIOS is lying
447 	 * to us. Hope that the IOMMU for this device is actually
448 	 * disabled, and it needs no translation...
449 	 */
450 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 	if (rc) {
452 		/* "can't" happen */
453 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 		return false;
455 	}
456 	vtbar &= 0xffff0000;
457 
458 	/* we know that this iommu should be at offset 0xa000 from vtbar */
459 	drhd = dmar_find_matched_drhd_unit(pdev);
460 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 		return true;
464 	}
465 
466 	return false;
467 }
468 
469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 	if (!iommu || iommu->drhd->ignored)
472 		return true;
473 
474 	if (dev_is_pci(dev)) {
475 		struct pci_dev *pdev = to_pci_dev(dev);
476 
477 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 		    quirk_ioat_snb_local_iommu(pdev))
480 			return true;
481 	}
482 
483 	return false;
484 }
485 
486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 	struct dmar_drhd_unit *drhd = NULL;
489 	struct pci_dev *pdev = NULL;
490 	struct intel_iommu *iommu;
491 	struct device *tmp;
492 	u16 segment = 0;
493 	int i;
494 
495 	if (!dev)
496 		return NULL;
497 
498 	if (dev_is_pci(dev)) {
499 		struct pci_dev *pf_pdev;
500 
501 		pdev = pci_real_dma_dev(to_pci_dev(dev));
502 
503 		/* VFs aren't listed in scope tables; we need to look up
504 		 * the PF instead to find the IOMMU. */
505 		pf_pdev = pci_physfn(pdev);
506 		dev = &pf_pdev->dev;
507 		segment = pci_domain_nr(pdev->bus);
508 	} else if (has_acpi_companion(dev))
509 		dev = &ACPI_COMPANION(dev)->dev;
510 
511 	rcu_read_lock();
512 	for_each_iommu(iommu, drhd) {
513 		if (pdev && segment != drhd->segment)
514 			continue;
515 
516 		for_each_active_dev_scope(drhd->devices,
517 					  drhd->devices_cnt, i, tmp) {
518 			if (tmp == dev) {
519 				/* For a VF use its original BDF# not that of the PF
520 				 * which we used for the IOMMU lookup. Strictly speaking
521 				 * we could do this for all PCI devices; we only need to
522 				 * get the BDF# from the scope table for ACPI matches. */
523 				if (pdev && pdev->is_virtfn)
524 					goto got_pdev;
525 
526 				if (bus && devfn) {
527 					*bus = drhd->devices[i].bus;
528 					*devfn = drhd->devices[i].devfn;
529 				}
530 				goto out;
531 			}
532 
533 			if (is_downstream_to_pci_bridge(dev, tmp))
534 				goto got_pdev;
535 		}
536 
537 		if (pdev && drhd->include_all) {
538 got_pdev:
539 			if (bus && devfn) {
540 				*bus = pdev->bus->number;
541 				*devfn = pdev->devfn;
542 			}
543 			goto out;
544 		}
545 	}
546 	iommu = NULL;
547 out:
548 	if (iommu_is_dummy(iommu, dev))
549 		iommu = NULL;
550 
551 	rcu_read_unlock();
552 
553 	return iommu;
554 }
555 
556 static void domain_flush_cache(struct dmar_domain *domain,
557 			       void *addr, int size)
558 {
559 	if (!domain->iommu_coherency)
560 		clflush_cache_range(addr, size);
561 }
562 
563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 	struct context_entry *context;
566 	int i;
567 
568 	if (!iommu->root_entry)
569 		return;
570 
571 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 		context = iommu_context_addr(iommu, i, 0, 0);
573 		if (context)
574 			iommu_free_page(context);
575 
576 		if (!sm_supported(iommu))
577 			continue;
578 
579 		context = iommu_context_addr(iommu, i, 0x80, 0);
580 		if (context)
581 			iommu_free_page(context);
582 	}
583 
584 	iommu_free_page(iommu->root_entry);
585 	iommu->root_entry = NULL;
586 }
587 
588 #ifdef CONFIG_DMAR_DEBUG
589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 	struct dma_pte *pte;
593 	int offset;
594 
595 	while (1) {
596 		offset = pfn_level_offset(pfn, level);
597 		pte = &parent[offset];
598 
599 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600 
601 		if (!dma_pte_present(pte)) {
602 			pr_info("page table not present at level %d\n", level - 1);
603 			break;
604 		}
605 
606 		if (level == 1 || dma_pte_superpage(pte))
607 			break;
608 
609 		parent = phys_to_virt(dma_pte_addr(pte));
610 		level--;
611 	}
612 }
613 
614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 			  unsigned long long addr, u32 pasid)
616 {
617 	struct pasid_dir_entry *dir, *pde;
618 	struct pasid_entry *entries, *pte;
619 	struct context_entry *ctx_entry;
620 	struct root_entry *rt_entry;
621 	int i, dir_index, index, level;
622 	u8 devfn = source_id & 0xff;
623 	u8 bus = source_id >> 8;
624 	struct dma_pte *pgtable;
625 
626 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627 
628 	/* root entry dump */
629 	if (!iommu->root_entry) {
630 		pr_info("root table is not present\n");
631 		return;
632 	}
633 	rt_entry = &iommu->root_entry[bus];
634 
635 	if (sm_supported(iommu))
636 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 			rt_entry->hi, rt_entry->lo);
638 	else
639 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
640 
641 	/* context entry dump */
642 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 	if (!ctx_entry) {
644 		pr_info("context table is not present\n");
645 		return;
646 	}
647 
648 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 		ctx_entry->hi, ctx_entry->lo);
650 
651 	/* legacy mode does not require PASID entries */
652 	if (!sm_supported(iommu)) {
653 		if (!context_present(ctx_entry)) {
654 			pr_info("legacy mode page table is not present\n");
655 			return;
656 		}
657 		level = agaw_to_level(ctx_entry->hi & 7);
658 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 		goto pgtable_walk;
660 	}
661 
662 	if (!context_present(ctx_entry)) {
663 		pr_info("pasid directory table is not present\n");
664 		return;
665 	}
666 
667 	/* get the pointer to pasid directory entry */
668 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669 
670 	/* For request-without-pasid, get the pasid from context entry */
671 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 		pasid = IOMMU_NO_PASID;
673 
674 	dir_index = pasid >> PASID_PDE_SHIFT;
675 	pde = &dir[dir_index];
676 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677 
678 	/* get the pointer to the pasid table entry */
679 	entries = get_pasid_table_from_pde(pde);
680 	if (!entries) {
681 		pr_info("pasid table is not present\n");
682 		return;
683 	}
684 	index = pasid & PASID_PTE_MASK;
685 	pte = &entries[index];
686 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688 
689 	if (!pasid_pte_is_present(pte)) {
690 		pr_info("scalable mode page table is not present\n");
691 		return;
692 	}
693 
694 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 	} else {
698 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 	}
701 
702 pgtable_walk:
703 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706 
707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 				      unsigned long pfn, int *target_level,
709 				      gfp_t gfp)
710 {
711 	struct dma_pte *parent, *pte;
712 	int level = agaw_to_level(domain->agaw);
713 	int offset;
714 
715 	if (!domain_pfn_supported(domain, pfn))
716 		/* Address beyond IOMMU's addressing capabilities. */
717 		return NULL;
718 
719 	parent = domain->pgd;
720 
721 	while (1) {
722 		void *tmp_page;
723 
724 		offset = pfn_level_offset(pfn, level);
725 		pte = &parent[offset];
726 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 			break;
728 		if (level == *target_level)
729 			break;
730 
731 		if (!dma_pte_present(pte)) {
732 			uint64_t pteval, tmp;
733 
734 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735 
736 			if (!tmp_page)
737 				return NULL;
738 
739 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
741 			if (domain->use_first_level)
742 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
743 
744 			tmp = 0ULL;
745 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
746 				/* Someone else set it while we were thinking; use theirs. */
747 				iommu_free_page(tmp_page);
748 			else
749 				domain_flush_cache(domain, pte, sizeof(*pte));
750 		}
751 		if (level == 1)
752 			break;
753 
754 		parent = phys_to_virt(dma_pte_addr(pte));
755 		level--;
756 	}
757 
758 	if (!*target_level)
759 		*target_level = level;
760 
761 	return pte;
762 }
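/*
 * Usage sketch (illustration; the variable names are hypothetical): a
 * caller creating a mapping passes the leaf level it wants, e.g.
 *
 *	int level = 1;	// force allocation down to the 4KiB level
 *	struct dma_pte *pte = pfn_to_dma_pte(domain, iov_pfn, &level, gfp);
 *
 * while a lookup that only wants whatever leaf already exists passes
 * *target_level == 0 and reads back the level the walk stopped at.
 */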
763 
764 /* return address's pte at specific level */
765 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
766 					 unsigned long pfn,
767 					 int level, int *large_page)
768 {
769 	struct dma_pte *parent, *pte;
770 	int total = agaw_to_level(domain->agaw);
771 	int offset;
772 
773 	parent = domain->pgd;
774 	while (level <= total) {
775 		offset = pfn_level_offset(pfn, total);
776 		pte = &parent[offset];
777 		if (level == total)
778 			return pte;
779 
780 		if (!dma_pte_present(pte)) {
781 			*large_page = total;
782 			break;
783 		}
784 
785 		if (dma_pte_superpage(pte)) {
786 			*large_page = total;
787 			return pte;
788 		}
789 
790 		parent = phys_to_virt(dma_pte_addr(pte));
791 		total--;
792 	}
793 	return NULL;
794 }
795 
796 /* clear last level pte, a tlb flush should be followed */
797 static void dma_pte_clear_range(struct dmar_domain *domain,
798 				unsigned long start_pfn,
799 				unsigned long last_pfn)
800 {
801 	unsigned int large_page;
802 	struct dma_pte *first_pte, *pte;
803 
804 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
805 	    WARN_ON(start_pfn > last_pfn))
806 		return;
807 
808 	/* we don't need lock here; nobody else touches the iova range */
809 	do {
810 		large_page = 1;
811 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
812 		if (!pte) {
813 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
814 			continue;
815 		}
816 		do {
817 			dma_clear_pte(pte);
818 			start_pfn += lvl_to_nr_pages(large_page);
819 			pte++;
820 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
821 
822 		domain_flush_cache(domain, first_pte,
823 				   (void *)pte - (void *)first_pte);
824 
825 	} while (start_pfn && start_pfn <= last_pfn);
826 }
827 
828 static void dma_pte_free_level(struct dmar_domain *domain, int level,
829 			       int retain_level, struct dma_pte *pte,
830 			       unsigned long pfn, unsigned long start_pfn,
831 			       unsigned long last_pfn)
832 {
833 	pfn = max(start_pfn, pfn);
834 	pte = &pte[pfn_level_offset(pfn, level)];
835 
836 	do {
837 		unsigned long level_pfn;
838 		struct dma_pte *level_pte;
839 
840 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
841 			goto next;
842 
843 		level_pfn = pfn & level_mask(level);
844 		level_pte = phys_to_virt(dma_pte_addr(pte));
845 
846 		if (level > 2) {
847 			dma_pte_free_level(domain, level - 1, retain_level,
848 					   level_pte, level_pfn, start_pfn,
849 					   last_pfn);
850 		}
851 
852 		/*
853 		 * Free the page table if we're below the level we want to
854 		 * retain and the range covers the entire table.
855 		 */
856 		if (level < retain_level && !(start_pfn > level_pfn ||
857 		      last_pfn < level_pfn + level_size(level) - 1)) {
858 			dma_clear_pte(pte);
859 			domain_flush_cache(domain, pte, sizeof(*pte));
860 			iommu_free_page(level_pte);
861 		}
862 next:
863 		pfn += level_size(level);
864 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
865 }
866 
867 /*
868  * clear last level (leaf) ptes and free page table pages below the
869  * level we wish to keep intact.
870  */
871 static void dma_pte_free_pagetable(struct dmar_domain *domain,
872 				   unsigned long start_pfn,
873 				   unsigned long last_pfn,
874 				   int retain_level)
875 {
876 	dma_pte_clear_range(domain, start_pfn, last_pfn);
877 
878 	/* We don't need lock here; nobody else touches the iova range */
879 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
880 			   domain->pgd, 0, start_pfn, last_pfn);
881 
882 	/* free pgd */
883 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
884 		iommu_free_page(domain->pgd);
885 		domain->pgd = NULL;
886 	}
887 }
888 
889 /* When a page at a given level is being unlinked from its parent, we don't
890    need to *modify* it at all. All we need to do is make a list of all the
891    pages which can be freed just as soon as we've flushed the IOTLB and we
892    know the hardware page-walk will no longer touch them.
893    The 'pte' argument is the *parent* PTE, pointing to the page that is to
894    be freed. */
895 static void dma_pte_list_pagetables(struct dmar_domain *domain,
896 				    int level, struct dma_pte *pte,
897 				    struct list_head *freelist)
898 {
899 	struct page *pg;
900 
901 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
902 	list_add_tail(&pg->lru, freelist);
903 
904 	if (level == 1)
905 		return;
906 
907 	pte = page_address(pg);
908 	do {
909 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
910 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
911 		pte++;
912 	} while (!first_pte_in_page(pte));
913 }
914 
915 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
916 				struct dma_pte *pte, unsigned long pfn,
917 				unsigned long start_pfn, unsigned long last_pfn,
918 				struct list_head *freelist)
919 {
920 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
921 
922 	pfn = max(start_pfn, pfn);
923 	pte = &pte[pfn_level_offset(pfn, level)];
924 
925 	do {
926 		unsigned long level_pfn = pfn & level_mask(level);
927 
928 		if (!dma_pte_present(pte))
929 			goto next;
930 
931 		/* If range covers entire pagetable, free it */
932 		if (start_pfn <= level_pfn &&
933 		    last_pfn >= level_pfn + level_size(level) - 1) {
934 			/* These subordinate page tables are going away entirely. Don't
935 			   bother to clear them; we're just going to *free* them. */
936 			if (level > 1 && !dma_pte_superpage(pte))
937 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
938 
939 			dma_clear_pte(pte);
940 			if (!first_pte)
941 				first_pte = pte;
942 			last_pte = pte;
943 		} else if (level > 1) {
944 			/* Recurse down into a level that isn't *entirely* obsolete */
945 			dma_pte_clear_level(domain, level - 1,
946 					    phys_to_virt(dma_pte_addr(pte)),
947 					    level_pfn, start_pfn, last_pfn,
948 					    freelist);
949 		}
950 next:
951 		pfn = level_pfn + level_size(level);
952 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
953 
954 	if (first_pte)
955 		domain_flush_cache(domain, first_pte,
956 				   (void *)++last_pte - (void *)first_pte);
957 }
958 
959 /* We can't just free the pages because the IOMMU may still be walking
960    the page tables, and may have cached the intermediate levels. The
961    pages can only be freed after the IOTLB flush has been done. */
962 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
963 			 unsigned long last_pfn, struct list_head *freelist)
964 {
965 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
966 	    WARN_ON(start_pfn > last_pfn))
967 		return;
968 
969 	/* we don't need lock here; nobody else touches the iova range */
970 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
971 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
972 
973 	/* free pgd */
974 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
975 		struct page *pgd_page = virt_to_page(domain->pgd);
976 		list_add_tail(&pgd_page->lru, freelist);
977 		domain->pgd = NULL;
978 	}
979 }
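/*
 * Typical call pattern (a sketch; pairing with cache_tag_flush_range() is
 * an assumption here, any IOTLB invalidation covering the range would do):
 *
 *	LIST_HEAD(freelist);
 *
 *	domain_unmap(domain, start_pfn, last_pfn, &freelist);
 *	cache_tag_flush_range(domain, start, end, 0);
 *	iommu_put_pages_list(&freelist);
 *
 * The page-table pages collected on the freelist are only handed back to
 * the allocator after the IOTLB flush, as the comment above requires.
 */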
980 
981 /* iommu handling */
982 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
983 {
984 	struct root_entry *root;
985 
986 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
987 	if (!root) {
988 		pr_err("Allocating root entry for %s failed\n",
989 			iommu->name);
990 		return -ENOMEM;
991 	}
992 
993 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
994 	iommu->root_entry = root;
995 
996 	return 0;
997 }
998 
999 static void iommu_set_root_entry(struct intel_iommu *iommu)
1000 {
1001 	u64 addr;
1002 	u32 sts;
1003 	unsigned long flag;
1004 
1005 	addr = virt_to_phys(iommu->root_entry);
1006 	if (sm_supported(iommu))
1007 		addr |= DMA_RTADDR_SMT;
1008 
1009 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1010 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1011 
1012 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1013 
1014 	/* Make sure hardware complete it */
1015 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1016 		      readl, (sts & DMA_GSTS_RTPS), sts);
1017 
1018 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1019 
1020 	/*
1021 	 * Hardware invalidates all DMA remapping hardware translation
1022 	 * caches as part of SRTP flow.
1023 	 */
1024 	if (cap_esrtps(iommu->cap))
1025 		return;
1026 
1027 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1028 	if (sm_supported(iommu))
1029 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1030 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1031 }
1032 
1033 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1034 {
1035 	u32 val;
1036 	unsigned long flag;
1037 
1038 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1039 		return;
1040 
1041 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1042 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1043 
1044 	/* Make sure hardware complete it */
1045 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1046 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1047 
1048 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1049 }
1050 
1051 /* return value determines if we need a write buffer flush */
1052 static void __iommu_flush_context(struct intel_iommu *iommu,
1053 				  u16 did, u16 source_id, u8 function_mask,
1054 				  u64 type)
1055 {
1056 	u64 val = 0;
1057 	unsigned long flag;
1058 
1059 	switch (type) {
1060 	case DMA_CCMD_GLOBAL_INVL:
1061 		val = DMA_CCMD_GLOBAL_INVL;
1062 		break;
1063 	case DMA_CCMD_DOMAIN_INVL:
1064 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1065 		break;
1066 	case DMA_CCMD_DEVICE_INVL:
1067 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1068 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1069 		break;
1070 	default:
1071 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1072 			iommu->name, type);
1073 		return;
1074 	}
1075 	val |= DMA_CCMD_ICC;
1076 
1077 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1078 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1079 
1080 	/* Make sure hardware complete it */
1081 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1082 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1083 
1084 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085 }
1086 
1087 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1088 			 unsigned int size_order, u64 type)
1089 {
1090 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1091 	u64 val = 0, val_iva = 0;
1092 	unsigned long flag;
1093 
1094 	switch (type) {
1095 	case DMA_TLB_GLOBAL_FLUSH:
1096 		/* global flush doesn't need to set IVA_REG */
1097 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1098 		break;
1099 	case DMA_TLB_DSI_FLUSH:
1100 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1101 		break;
1102 	case DMA_TLB_PSI_FLUSH:
1103 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 		/* IH bit is passed in as part of address */
1105 		val_iva = size_order | addr;
1106 		break;
1107 	default:
1108 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1109 			iommu->name, type);
1110 		return;
1111 	}
1112 
1113 	if (cap_write_drain(iommu->cap))
1114 		val |= DMA_TLB_WRITE_DRAIN;
1115 
1116 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1117 	/* Note: Only uses first TLB reg currently */
1118 	if (val_iva)
1119 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1120 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1121 
1122 	/* Make sure hardware complete it */
1123 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1124 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1125 
1126 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1127 
1128 	/* check IOTLB invalidation granularity */
1129 	if (DMA_TLB_IAIG(val) == 0)
1130 		pr_err("Flush IOTLB failed\n");
1131 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1132 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1133 			(unsigned long long)DMA_TLB_IIRG(type),
1134 			(unsigned long long)DMA_TLB_IAIG(val));
1135 }
1136 
1137 static struct device_domain_info *
1138 domain_lookup_dev_info(struct dmar_domain *domain,
1139 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1140 {
1141 	struct device_domain_info *info;
1142 	unsigned long flags;
1143 
1144 	spin_lock_irqsave(&domain->lock, flags);
1145 	list_for_each_entry(info, &domain->devices, link) {
1146 		if (info->iommu == iommu && info->bus == bus &&
1147 		    info->devfn == devfn) {
1148 			spin_unlock_irqrestore(&domain->lock, flags);
1149 			return info;
1150 		}
1151 	}
1152 	spin_unlock_irqrestore(&domain->lock, flags);
1153 
1154 	return NULL;
1155 }
1156 
1157 /*
1158  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1159  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1160  * check because it applies only to the built-in QAT devices and it doesn't
1161  * grant additional privileges.
1162  */
1163 #define BUGGY_QAT_DEVID_MASK 0x4940
1164 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1165 {
1166 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1167 		return false;
1168 
1169 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1170 		return false;
1171 
1172 	return true;
1173 }
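/*
 * Worked example (illustration only): masking with 0xfffc clears the two
 * low bits, so device IDs 0x4940, 0x4941, 0x4942 and 0x4943 all compare
 * equal to BUGGY_QAT_DEVID_MASK, while any other Intel device ID does not.
 */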
1174 
1175 static void iommu_enable_pci_caps(struct device_domain_info *info)
1176 {
1177 	struct pci_dev *pdev;
1178 
1179 	if (!dev_is_pci(info->dev))
1180 		return;
1181 
1182 	pdev = to_pci_dev(info->dev);
1183 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1184 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1185 		info->ats_enabled = 1;
1186 }
1187 
1188 static void iommu_disable_pci_caps(struct device_domain_info *info)
1189 {
1190 	struct pci_dev *pdev;
1191 
1192 	if (!dev_is_pci(info->dev))
1193 		return;
1194 
1195 	pdev = to_pci_dev(info->dev);
1196 
1197 	if (info->ats_enabled) {
1198 		pci_disable_ats(pdev);
1199 		info->ats_enabled = 0;
1200 	}
1201 }
1202 
1203 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1204 {
1205 	cache_tag_flush_all(to_dmar_domain(domain));
1206 }
1207 
1208 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1209 {
1210 	u32 pmen;
1211 	unsigned long flags;
1212 
1213 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1214 		return;
1215 
1216 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1217 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1218 	pmen &= ~DMA_PMEN_EPM;
1219 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1220 
1221 	/* wait for the protected region status bit to clear */
1222 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1223 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1224 
1225 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226 }
1227 
1228 static void iommu_enable_translation(struct intel_iommu *iommu)
1229 {
1230 	u32 sts;
1231 	unsigned long flags;
1232 
1233 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1234 	iommu->gcmd |= DMA_GCMD_TE;
1235 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1236 
1237 	/* Make sure hardware complete it */
1238 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1239 		      readl, (sts & DMA_GSTS_TES), sts);
1240 
1241 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1242 }
1243 
1244 static void iommu_disable_translation(struct intel_iommu *iommu)
1245 {
1246 	u32 sts;
1247 	unsigned long flag;
1248 
1249 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1250 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1251 		return;
1252 
1253 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 	iommu->gcmd &= ~DMA_GCMD_TE;
1255 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1256 
1257 	/* Make sure hardware complete it */
1258 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1260 
1261 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263 
1264 static int iommu_init_domains(struct intel_iommu *iommu)
1265 {
1266 	u32 ndomains;
1267 
1268 	ndomains = cap_ndoms(iommu->cap);
1269 	pr_debug("%s: Number of Domains supported <%d>\n",
1270 		 iommu->name, ndomains);
1271 
1272 	spin_lock_init(&iommu->lock);
1273 
1274 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1275 	if (!iommu->domain_ids)
1276 		return -ENOMEM;
1277 
1278 	/*
1279 	 * If Caching mode is set, then invalid translations are tagged
1280 	 * with domain-id 0, hence we need to pre-allocate it. We also
1281 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1282 	 * make sure it is not used for a real domain.
1283 	 */
1284 	set_bit(0, iommu->domain_ids);
1285 
1286 	/*
1287 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1288 	 * entry for first-level or pass-through translation modes should
1289 	 * be programmed with a domain id different from those used for
1290 	 * second-level or nested translation. We reserve a domain id for
1291 	 * this purpose. This domain id is also used for identity domain
1292 	 * in legacy mode.
1293 	 */
1294 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1295 
1296 	return 0;
1297 }
1298 
1299 static void disable_dmar_iommu(struct intel_iommu *iommu)
1300 {
1301 	if (!iommu->domain_ids)
1302 		return;
1303 
1304 	/*
1305 	 * All iommu domains must have been detached from the devices,
1306 	 * hence there should be no domain IDs in use.
1307 	 */
1308 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1309 		    > NUM_RESERVED_DID))
1310 		return;
1311 
1312 	if (iommu->gcmd & DMA_GCMD_TE)
1313 		iommu_disable_translation(iommu);
1314 }
1315 
1316 static void free_dmar_iommu(struct intel_iommu *iommu)
1317 {
1318 	if (iommu->domain_ids) {
1319 		bitmap_free(iommu->domain_ids);
1320 		iommu->domain_ids = NULL;
1321 	}
1322 
1323 	if (iommu->copied_tables) {
1324 		bitmap_free(iommu->copied_tables);
1325 		iommu->copied_tables = NULL;
1326 	}
1327 
1328 	/* free context mapping */
1329 	free_context_table(iommu);
1330 
1331 	if (ecap_prs(iommu->ecap))
1332 		intel_iommu_finish_prq(iommu);
1333 }
1334 
1335 /*
1336  * Check and return whether first level is used by default for
1337  * DMA translation.
1338  */
1339 static bool first_level_by_default(struct intel_iommu *iommu)
1340 {
1341 	/* Only SL is available in legacy mode */
1342 	if (!sm_supported(iommu))
1343 		return false;
1344 
1345 	/* Only one level (either FL or SL) is available, just use it */
1346 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1347 		return ecap_flts(iommu->ecap);
1348 
1349 	return true;
1350 }
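/*
 * Summary of the cases above (illustration): legacy mode -> false (only
 * second level exists); scalable mode with exactly one of ecap_flts() and
 * ecap_slts() set -> whichever one is available; scalable mode with both
 * set -> true, i.e. first level is preferred by default.
 */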
1351 
1352 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1353 {
1354 	struct iommu_domain_info *info, *curr;
1355 	unsigned long ndomains;
1356 	int num, ret = -ENOSPC;
1357 
1358 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1359 		return 0;
1360 
1361 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1362 	if (!info)
1363 		return -ENOMEM;
1364 
1365 	spin_lock(&iommu->lock);
1366 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1367 	if (curr) {
1368 		curr->refcnt++;
1369 		spin_unlock(&iommu->lock);
1370 		kfree(info);
1371 		return 0;
1372 	}
1373 
1374 	ndomains = cap_ndoms(iommu->cap);
1375 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1376 	if (num >= ndomains) {
1377 		pr_err("%s: No free domain ids\n", iommu->name);
1378 		goto err_unlock;
1379 	}
1380 
1381 	set_bit(num, iommu->domain_ids);
1382 	info->refcnt	= 1;
1383 	info->did	= num;
1384 	info->iommu	= iommu;
1385 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1386 			  NULL, info, GFP_ATOMIC);
1387 	if (curr) {
1388 		ret = xa_err(curr) ? : -EBUSY;
1389 		goto err_clear;
1390 	}
1391 
1392 	spin_unlock(&iommu->lock);
1393 	return 0;
1394 
1395 err_clear:
1396 	clear_bit(info->did, iommu->domain_ids);
1397 err_unlock:
1398 	spin_unlock(&iommu->lock);
1399 	kfree(info);
1400 	return ret;
1401 }
1402 
1403 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1404 {
1405 	struct iommu_domain_info *info;
1406 
1407 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1408 		return;
1409 
1410 	spin_lock(&iommu->lock);
1411 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1412 	if (--info->refcnt == 0) {
1413 		clear_bit(info->did, iommu->domain_ids);
1414 		xa_erase(&domain->iommu_array, iommu->seq_id);
1415 		domain->nid = NUMA_NO_NODE;
1416 		kfree(info);
1417 	}
1418 	spin_unlock(&iommu->lock);
1419 }
1420 
1421 static void domain_exit(struct dmar_domain *domain)
1422 {
1423 	if (domain->pgd) {
1424 		LIST_HEAD(freelist);
1425 
1426 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1427 		iommu_put_pages_list(&freelist);
1428 	}
1429 
1430 	if (WARN_ON(!list_empty(&domain->devices)))
1431 		return;
1432 
1433 	kfree(domain->qi_batch);
1434 	kfree(domain);
1435 }
1436 
1437 /*
1438  * For kdump cases, old valid entries may be cached due to the
1439  * in-flight DMA and copied pgtable, but there is no unmapping
1440  * behaviour for them, thus we need an explicit cache flush for
1441  * the newly-mapped device. For kdump, at this point, the device
1442  * is supposed to finish reset at its driver probe stage, so no
1443  * in-flight DMA will exist, and we don't need to worry about it
1444  * hereafter.
1445  */
1446 static void copied_context_tear_down(struct intel_iommu *iommu,
1447 				     struct context_entry *context,
1448 				     u8 bus, u8 devfn)
1449 {
1450 	u16 did_old;
1451 
1452 	if (!context_copied(iommu, bus, devfn))
1453 		return;
1454 
1455 	assert_spin_locked(&iommu->lock);
1456 
1457 	did_old = context_domain_id(context);
1458 	context_clear_entry(context);
1459 
1460 	if (did_old < cap_ndoms(iommu->cap)) {
1461 		iommu->flush.flush_context(iommu, did_old,
1462 					   PCI_DEVID(bus, devfn),
1463 					   DMA_CCMD_MASK_NOBIT,
1464 					   DMA_CCMD_DEVICE_INVL);
1465 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1466 					 DMA_TLB_DSI_FLUSH);
1467 	}
1468 
1469 	clear_context_copied(iommu, bus, devfn);
1470 }
1471 
1472 /*
1473  * It's a non-present to present mapping. If hardware doesn't cache
1474  * non-present entries, we only need to flush the write-buffer. If it
1475  * _does_ cache non-present entries, then it does so in the special
1476  * domain #0, which we have to flush:
1477  */
1478 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1479 					u8 bus, u8 devfn)
1480 {
1481 	if (cap_caching_mode(iommu->cap)) {
1482 		iommu->flush.flush_context(iommu, 0,
1483 					   PCI_DEVID(bus, devfn),
1484 					   DMA_CCMD_MASK_NOBIT,
1485 					   DMA_CCMD_DEVICE_INVL);
1486 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1487 	} else {
1488 		iommu_flush_write_buffer(iommu);
1489 	}
1490 }
1491 
1492 static int domain_context_mapping_one(struct dmar_domain *domain,
1493 				      struct intel_iommu *iommu,
1494 				      u8 bus, u8 devfn)
1495 {
1496 	struct device_domain_info *info =
1497 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1498 	u16 did = domain_id_iommu(domain, iommu);
1499 	int translation = CONTEXT_TT_MULTI_LEVEL;
1500 	struct dma_pte *pgd = domain->pgd;
1501 	struct context_entry *context;
1502 	int ret;
1503 
1504 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1505 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1506 
1507 	spin_lock(&iommu->lock);
1508 	ret = -ENOMEM;
1509 	context = iommu_context_addr(iommu, bus, devfn, 1);
1510 	if (!context)
1511 		goto out_unlock;
1512 
1513 	ret = 0;
1514 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1515 		goto out_unlock;
1516 
1517 	copied_context_tear_down(iommu, context, bus, devfn);
1518 	context_clear_entry(context);
1519 	context_set_domain_id(context, did);
1520 
1521 	if (info && info->ats_supported)
1522 		translation = CONTEXT_TT_DEV_IOTLB;
1523 	else
1524 		translation = CONTEXT_TT_MULTI_LEVEL;
1525 
1526 	context_set_address_root(context, virt_to_phys(pgd));
1527 	context_set_address_width(context, domain->agaw);
1528 	context_set_translation_type(context, translation);
1529 	context_set_fault_enable(context);
1530 	context_set_present(context);
1531 	if (!ecap_coherent(iommu->ecap))
1532 		clflush_cache_range(context, sizeof(*context));
1533 	context_present_cache_flush(iommu, did, bus, devfn);
1534 	ret = 0;
1535 
1536 out_unlock:
1537 	spin_unlock(&iommu->lock);
1538 
1539 	return ret;
1540 }
1541 
1542 static int domain_context_mapping_cb(struct pci_dev *pdev,
1543 				     u16 alias, void *opaque)
1544 {
1545 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1546 	struct intel_iommu *iommu = info->iommu;
1547 	struct dmar_domain *domain = opaque;
1548 
1549 	return domain_context_mapping_one(domain, iommu,
1550 					  PCI_BUS_NUM(alias), alias & 0xff);
1551 }
1552 
1553 static int
1554 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1555 {
1556 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1557 	struct intel_iommu *iommu = info->iommu;
1558 	u8 bus = info->bus, devfn = info->devfn;
1559 
1560 	if (!dev_is_pci(dev))
1561 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1562 
1563 	return pci_for_each_dma_alias(to_pci_dev(dev),
1564 				      domain_context_mapping_cb, domain);
1565 }
1566 
1567 /* Return largest possible superpage level for a given mapping */
1568 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1569 				   unsigned long phy_pfn, unsigned long pages)
1570 {
1571 	int support, level = 1;
1572 	unsigned long pfnmerge;
1573 
1574 	support = domain->iommu_superpage;
1575 
1576 	/* To use a large page, the virtual *and* physical addresses
1577 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1578 	   of them will mean we have to use smaller pages. So just
1579 	   merge them and check both at once. */
1580 	pfnmerge = iov_pfn | phy_pfn;
1581 
1582 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1583 		pages >>= VTD_STRIDE_SHIFT;
1584 		if (!pages)
1585 			break;
1586 		pfnmerge >>= VTD_STRIDE_SHIFT;
1587 		level++;
1588 		support--;
1589 	}
1590 	return level;
1591 }
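/*
 * Worked example (a sketch; the values are assumed and the domain is
 * assumed to support at least one superpage level): with
 * VTD_STRIDE_SHIFT == 9, iov_pfn == 0x200, phy_pfn == 0x400 and
 * pages == 0x200, pfnmerge == 0x600 has its low nine bits clear, so the
 * first pass raises level to 2 (2MiB). The shifted pfnmerge (0x3) is no
 * longer stride-aligned, so the loop stops and the function returns 2.
 */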
1592 
1593 /*
1594  * Ensure that old small page tables are removed to make room for superpage(s).
1595  * We're going to add new large pages, so make sure we don't remove their parent
1596  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1597  */
1598 static void switch_to_super_page(struct dmar_domain *domain,
1599 				 unsigned long start_pfn,
1600 				 unsigned long end_pfn, int level)
1601 {
1602 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1603 	struct dma_pte *pte = NULL;
1604 
1605 	while (start_pfn <= end_pfn) {
1606 		if (!pte)
1607 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1608 					     GFP_ATOMIC);
1609 
1610 		if (dma_pte_present(pte)) {
1611 			dma_pte_free_pagetable(domain, start_pfn,
1612 					       start_pfn + lvl_pages - 1,
1613 					       level + 1);
1614 
1615 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1616 					      end_pfn << VTD_PAGE_SHIFT, 0);
1617 		}
1618 
1619 		pte++;
1620 		start_pfn += lvl_pages;
1621 		if (first_pte_in_page(pte))
1622 			pte = NULL;
1623 	}
1624 }
1625 
1626 static int
1627 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1628 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1629 		 gfp_t gfp)
1630 {
1631 	struct dma_pte *first_pte = NULL, *pte = NULL;
1632 	unsigned int largepage_lvl = 0;
1633 	unsigned long lvl_pages = 0;
1634 	phys_addr_t pteval;
1635 	u64 attr;
1636 
1637 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1638 		return -EINVAL;
1639 
1640 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1641 		return -EINVAL;
1642 
1643 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1644 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1645 		return -EINVAL;
1646 	}
1647 
1648 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1649 	attr |= DMA_FL_PTE_PRESENT;
1650 	if (domain->use_first_level) {
1651 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1652 		if (prot & DMA_PTE_WRITE)
1653 			attr |= DMA_FL_PTE_DIRTY;
1654 	}
1655 
1656 	domain->has_mappings = true;
1657 
1658 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1659 
1660 	while (nr_pages > 0) {
1661 		uint64_t tmp;
1662 
1663 		if (!pte) {
1664 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1665 					phys_pfn, nr_pages);
1666 
1667 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1668 					     gfp);
1669 			if (!pte)
1670 				return -ENOMEM;
1671 			first_pte = pte;
1672 
1673 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1674 
1675 			/* It is a large page */
1676 			if (largepage_lvl > 1) {
1677 				unsigned long end_pfn;
1678 				unsigned long pages_to_remove;
1679 
1680 				pteval |= DMA_PTE_LARGE_PAGE;
1681 				pages_to_remove = min_t(unsigned long, nr_pages,
1682 							nr_pte_to_next_page(pte) * lvl_pages);
1683 				end_pfn = iov_pfn + pages_to_remove - 1;
1684 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1685 			} else {
1686 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1687 			}
1688 
1689 		}
1690 		/* We don't need a lock here; nobody else
1691 		 * touches this IOVA range.
1692 		 */
1693 		tmp = 0ULL;
1694 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1695 			static int dumps = 5;
1696 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1697 				iov_pfn, tmp, (unsigned long long)pteval);
1698 			if (dumps) {
1699 				dumps--;
1700 				debug_dma_dump_mappings(NULL);
1701 			}
1702 			WARN_ON(1);
1703 		}
1704 
1705 		nr_pages -= lvl_pages;
1706 		iov_pfn += lvl_pages;
1707 		phys_pfn += lvl_pages;
1708 		pteval += lvl_pages * VTD_PAGE_SIZE;
1709 
1710 		/* If the next PTE would be the first in a new page, then we
1711 		 * need to flush the cache on the entries we've just written.
1712 		 * And then we'll need to recalculate 'pte', so clear it and
1713 		 * let it get set again in the if (!pte) block above.
1714 		 *
1715 		 * If we're done (!nr_pages) we need to flush the cache too.
1716 		 *
1717 		 * Also if we've been setting superpages, we may need to
1718 		 * recalculate 'pte' and switch back to smaller pages for the
1719 		 * end of the mapping, if the trailing size is not enough to
1720 		 * use another superpage (i.e. nr_pages < lvl_pages).
1721 		 */
1722 		pte++;
1723 		if (!nr_pages || first_pte_in_page(pte) ||
1724 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1725 			domain_flush_cache(domain, first_pte,
1726 					   (void *)pte - (void *)first_pte);
1727 			pte = NULL;
1728 		}
1729 	}
1730 
1731 	return 0;
1732 }
1733 
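/*
 * Clear the context entry for @bus:@devfn on @info->iommu and flush the
 * caches that may still reference the old translation.
 */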
1734 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1735 {
1736 	struct intel_iommu *iommu = info->iommu;
1737 	struct context_entry *context;
1738 	u16 did;
1739 
1740 	spin_lock(&iommu->lock);
1741 	context = iommu_context_addr(iommu, bus, devfn, 0);
1742 	if (!context) {
1743 		spin_unlock(&iommu->lock);
1744 		return;
1745 	}
1746 
1747 	did = context_domain_id(context);
1748 	context_clear_entry(context);
1749 	__iommu_flush_cache(iommu, context, sizeof(*context));
1750 	spin_unlock(&iommu->lock);
1751 	intel_context_flush_present(info, context, did, true);
1752 }
1753 
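/*
 * Install a first-stage PASID table entry for @pasid on @dev, pointing at
 * the page table @pgd and tagged with domain ID @did. If @old is non-NULL
 * the existing entry is replaced instead of set up from scratch.
 */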
1754 int __domain_setup_first_level(struct intel_iommu *iommu,
1755 			       struct device *dev, ioasid_t pasid,
1756 			       u16 did, pgd_t *pgd, int flags,
1757 			       struct iommu_domain *old)
1758 {
1759 	if (!old)
1760 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1761 						     pasid, did, flags);
1762 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1763 					       iommu_domain_did(old, iommu),
1764 					       flags);
1765 }
1766 
1767 static int domain_setup_second_level(struct intel_iommu *iommu,
1768 				     struct dmar_domain *domain,
1769 				     struct device *dev, ioasid_t pasid,
1770 				     struct iommu_domain *old)
1771 {
1772 	if (!old)
1773 		return intel_pasid_setup_second_level(iommu, domain,
1774 						      dev, pasid);
1775 	return intel_pasid_replace_second_level(iommu, domain, dev,
1776 						iommu_domain_did(old, iommu),
1777 						pasid);
1778 }
1779 
1780 static int domain_setup_passthrough(struct intel_iommu *iommu,
1781 				    struct device *dev, ioasid_t pasid,
1782 				    struct iommu_domain *old)
1783 {
1784 	if (!old)
1785 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1786 	return intel_pasid_replace_pass_through(iommu, dev,
1787 						iommu_domain_did(old, iommu),
1788 						pasid);
1789 }
1790 
1791 static int domain_setup_first_level(struct intel_iommu *iommu,
1792 				    struct dmar_domain *domain,
1793 				    struct device *dev,
1794 				    u32 pasid, struct iommu_domain *old)
1795 {
1796 	struct dma_pte *pgd = domain->pgd;
1797 	int level, flags = 0;
1798 
1799 	level = agaw_to_level(domain->agaw);
1800 	if (level != 4 && level != 5)
1801 		return -EINVAL;
1802 
1803 	if (level == 5)
1804 		flags |= PASID_FLAG_FL5LP;
1805 
1806 	if (domain->force_snooping)
1807 		flags |= PASID_FLAG_PAGE_SNOOP;
1808 
1809 	return __domain_setup_first_level(iommu, dev, pasid,
1810 					  domain_id_iommu(domain, iommu),
1811 					  (pgd_t *)pgd, flags, old);
1812 }
1813 
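/*
 * Attach @dev to @domain: take a reference on the IOMMU, link the device
 * into the domain's device list, program the context or PASID table entry
 * according to the translation mode, and assign a cache tag. On failure
 * the device is put back into blocked translation.
 */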
1814 static int dmar_domain_attach_device(struct dmar_domain *domain,
1815 				     struct device *dev)
1816 {
1817 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1818 	struct intel_iommu *iommu = info->iommu;
1819 	unsigned long flags;
1820 	int ret;
1821 
1822 	ret = domain_attach_iommu(domain, iommu);
1823 	if (ret)
1824 		return ret;
1825 
1826 	info->domain = domain;
1827 	spin_lock_irqsave(&domain->lock, flags);
1828 	list_add(&info->link, &domain->devices);
1829 	spin_unlock_irqrestore(&domain->lock, flags);
1830 
1831 	if (dev_is_real_dma_subdevice(dev))
1832 		return 0;
1833 
1834 	if (!sm_supported(iommu))
1835 		ret = domain_context_mapping(domain, dev);
1836 	else if (domain->use_first_level)
1837 		ret = domain_setup_first_level(iommu, domain, dev,
1838 					       IOMMU_NO_PASID, NULL);
1839 	else
1840 		ret = domain_setup_second_level(iommu, domain, dev,
1841 						IOMMU_NO_PASID, NULL);
1842 
1843 	if (ret)
1844 		goto out_block_translation;
1845 
1846 	iommu_enable_pci_caps(info);
1847 
1848 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1849 	if (ret)
1850 		goto out_block_translation;
1851 
1852 	return 0;
1853 
1854 out_block_translation:
1855 	device_block_translation(dev);
1856 	return ret;
1857 }
1858 
1859 /**
1860  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1861  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1862  * @dev: device handle
1863  *
1864  * We assume that PCI USB devices with RMRRs have them largely
1865  * for historical reasons and that the RMRR space is not actively used post
1866  * boot.  This exclusion may change if vendors begin to abuse it.
1867  *
1868  * The same exception is made for graphics devices, with the requirement that
1869  * any use of the RMRR regions will be torn down before assigning the device
1870  * to a guest.
1871  *
1872  * Return: true if the RMRR is relaxable, false otherwise
1873  */
1874 static bool device_rmrr_is_relaxable(struct device *dev)
1875 {
1876 	struct pci_dev *pdev;
1877 
1878 	if (!dev_is_pci(dev))
1879 		return false;
1880 
1881 	pdev = to_pci_dev(dev);
1882 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1883 		return true;
1884 	else
1885 		return false;
1886 }
1887 
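/*
 * Return the default domain type required for @dev, or 0 if the core may
 * choose freely between DMA and identity domains.
 */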
1888 static int device_def_domain_type(struct device *dev)
1889 {
1890 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1891 	struct intel_iommu *iommu = info->iommu;
1892 
1893 	/*
1894 	 * Hardware does not support the passthrough translation mode.
1895 	 * Always use a dynamic mapping domain.
1896 	 */
1897 	if (!ecap_pass_through(iommu->ecap))
1898 		return IOMMU_DOMAIN_DMA;
1899 
1900 	if (dev_is_pci(dev)) {
1901 		struct pci_dev *pdev = to_pci_dev(dev);
1902 
1903 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1904 			return IOMMU_DOMAIN_IDENTITY;
1905 	}
1906 
1907 	return 0;
1908 }
1909 
1910 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1911 {
1912 	/*
1913 	 * Start from a sane IOMMU hardware state.
1914 	 * If queued invalidation was already initialized by us
1915 	 * (for example, while enabling interrupt remapping) then
1916 	 * things are already rolling from a sane state.
1917 	 */
1918 	if (!iommu->qi) {
1919 		/*
1920 		 * Clear any previous faults.
1921 		 */
1922 		dmar_fault(-1, iommu);
1923 		/*
1924 		 * Disable queued invalidation if supported and already enabled
1925 		 * before OS handover.
1926 		 */
1927 		dmar_disable_qi(iommu);
1928 	}
1929 
1930 	if (dmar_enable_qi(iommu)) {
1931 		/*
1932 		 * Queued Invalidate not enabled, use Register Based Invalidate
1933 		 */
1934 		iommu->flush.flush_context = __iommu_flush_context;
1935 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1936 		pr_info("%s: Using Register based invalidation\n",
1937 			iommu->name);
1938 	} else {
1939 		iommu->flush.flush_context = qi_flush_context;
1940 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1941 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1942 	}
1943 }
1944 
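/*
 * Copy one bus worth of context entries from the previous kernel's tables
 * during kdump handover. Present entries are duplicated into freshly
 * allocated context tables and their domain IDs are reserved in
 * iommu->domain_ids.
 */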
1945 static int copy_context_table(struct intel_iommu *iommu,
1946 			      struct root_entry *old_re,
1947 			      struct context_entry **tbl,
1948 			      int bus, bool ext)
1949 {
1950 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1951 	struct context_entry *new_ce = NULL, ce;
1952 	struct context_entry *old_ce = NULL;
1953 	struct root_entry re;
1954 	phys_addr_t old_ce_phys;
1955 
1956 	tbl_idx = ext ? bus * 2 : bus;
1957 	memcpy(&re, old_re, sizeof(re));
1958 
1959 	for (devfn = 0; devfn < 256; devfn++) {
1960 		/* First calculate the correct index */
1961 		idx = (ext ? devfn * 2 : devfn) % 256;
1962 
1963 		if (idx == 0) {
1964 			/* First save what we may have and clean up */
1965 			if (new_ce) {
1966 				tbl[tbl_idx] = new_ce;
1967 				__iommu_flush_cache(iommu, new_ce,
1968 						    VTD_PAGE_SIZE);
1969 				pos = 1;
1970 			}
1971 
1972 			if (old_ce)
1973 				memunmap(old_ce);
1974 
1975 			ret = 0;
1976 			if (devfn < 0x80)
1977 				old_ce_phys = root_entry_lctp(&re);
1978 			else
1979 				old_ce_phys = root_entry_uctp(&re);
1980 
1981 			if (!old_ce_phys) {
1982 				if (ext && devfn == 0) {
1983 					/* No LCTP, try UCTP */
1984 					devfn = 0x7f;
1985 					continue;
1986 				} else {
1987 					goto out;
1988 				}
1989 			}
1990 
1991 			ret = -ENOMEM;
1992 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1993 					MEMREMAP_WB);
1994 			if (!old_ce)
1995 				goto out;
1996 
1997 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1998 			if (!new_ce)
1999 				goto out_unmap;
2000 
2001 			ret = 0;
2002 		}
2003 
2004 		/* Now copy the context entry */
2005 		memcpy(&ce, old_ce + idx, sizeof(ce));
2006 
2007 		if (!context_present(&ce))
2008 			continue;
2009 
2010 		did = context_domain_id(&ce);
2011 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2012 			set_bit(did, iommu->domain_ids);
2013 
2014 		set_context_copied(iommu, bus, devfn);
2015 		new_ce[idx] = ce;
2016 	}
2017 
2018 	tbl[tbl_idx + pos] = new_ce;
2019 
2020 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2021 
2022 out_unmap:
2023 	memunmap(old_ce);
2024 
2025 out:
2026 	return ret;
2027 }
2028 
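/*
 * Copy the root and context tables left programmed by the previous kernel
 * (kdump case) so that in-flight DMA keeps a valid translation while this
 * kernel takes over the IOMMU.
 */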
2029 static int copy_translation_tables(struct intel_iommu *iommu)
2030 {
2031 	struct context_entry **ctxt_tbls;
2032 	struct root_entry *old_rt;
2033 	phys_addr_t old_rt_phys;
2034 	int ctxt_table_entries;
2035 	u64 rtaddr_reg;
2036 	int bus, ret;
2037 	bool new_ext, ext;
2038 
2039 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2040 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2041 	new_ext    = !!sm_supported(iommu);
2042 
2043 	/*
2044 	 * The RTT bit can only be changed when translation is disabled,
2045 	 * but disabling translation would open a window for data
2046 	 * corruption. So bail out and don't copy anything if we would
2047 	 * have to change the bit.
2048 	 */
2049 	if (new_ext != ext)
2050 		return -EINVAL;
2051 
2052 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2053 	if (!iommu->copied_tables)
2054 		return -ENOMEM;
2055 
2056 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2057 	if (!old_rt_phys)
2058 		return -EINVAL;
2059 
2060 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2061 	if (!old_rt)
2062 		return -ENOMEM;
2063 
2064 	/* This is too big for the stack - allocate it from slab */
2065 	ctxt_table_entries = ext ? 512 : 256;
2066 	ret = -ENOMEM;
2067 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2068 	if (!ctxt_tbls)
2069 		goto out_unmap;
2070 
2071 	for (bus = 0; bus < 256; bus++) {
2072 		ret = copy_context_table(iommu, &old_rt[bus],
2073 					 ctxt_tbls, bus, ext);
2074 		if (ret) {
2075 			pr_err("%s: Failed to copy context table for bus %d\n",
2076 				iommu->name, bus);
2077 			continue;
2078 		}
2079 	}
2080 
2081 	spin_lock(&iommu->lock);
2082 
2083 	/* Context tables are copied, now write them to the root_entry table */
2084 	for (bus = 0; bus < 256; bus++) {
2085 		int idx = ext ? bus * 2 : bus;
2086 		u64 val;
2087 
2088 		if (ctxt_tbls[idx]) {
2089 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2090 			iommu->root_entry[bus].lo = val;
2091 		}
2092 
2093 		if (!ext || !ctxt_tbls[idx + 1])
2094 			continue;
2095 
2096 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2097 		iommu->root_entry[bus].hi = val;
2098 	}
2099 
2100 	spin_unlock(&iommu->lock);
2101 
2102 	kfree(ctxt_tbls);
2103 
2104 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2105 
2106 	ret = 0;
2107 
2108 out_unmap:
2109 	memunmap(old_rt);
2110 
2111 	return ret;
2112 }
2113 
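/*
 * Initialize all DMAR units found at boot: set up queued invalidation,
 * domain ID tracking and root entries, copy over translation tables in the
 * kdump case, and enable the page request queue and fault interrupt on
 * each IOMMU.
 */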
2114 static int __init init_dmars(void)
2115 {
2116 	struct dmar_drhd_unit *drhd;
2117 	struct intel_iommu *iommu;
2118 	int ret;
2119 
2120 	for_each_iommu(iommu, drhd) {
2121 		if (drhd->ignored) {
2122 			iommu_disable_translation(iommu);
2123 			continue;
2124 		}
2125 
2126 		/*
2127 		 * Find the max PASID size of all IOMMUs in the system.
2128 		 * We need to ensure the system PASID table is no bigger
2129 		 * than the smallest supported size.
2130 		 */
2131 		if (pasid_supported(iommu)) {
2132 			u32 temp = 2 << ecap_pss(iommu->ecap);
2133 
2134 			intel_pasid_max_id = min_t(u32, temp,
2135 						   intel_pasid_max_id);
2136 		}
2137 
2138 		intel_iommu_init_qi(iommu);
2139 
2140 		ret = iommu_init_domains(iommu);
2141 		if (ret)
2142 			goto free_iommu;
2143 
2144 		init_translation_status(iommu);
2145 
2146 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2147 			iommu_disable_translation(iommu);
2148 			clear_translation_pre_enabled(iommu);
2149 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2150 				iommu->name);
2151 		}
2152 
2153 		/*
2154 		 * TBD:
2155 		 * we could share the same root & context tables
2156 		 * among all IOMMUs. Need to split this out later.
2157 		 */
2158 		ret = iommu_alloc_root_entry(iommu);
2159 		if (ret)
2160 			goto free_iommu;
2161 
2162 		if (translation_pre_enabled(iommu)) {
2163 			pr_info("Translation already enabled - trying to copy translation structures\n");
2164 
2165 			ret = copy_translation_tables(iommu);
2166 			if (ret) {
2167 				/*
2168 				 * We found the IOMMU with translation
2169 				 * enabled - but failed to copy over the
2170 				 * old root-entry table. Try to proceed
2171 				 * by disabling translation now and
2172 				 * allocating a clean root-entry table.
2173 				 * This might cause DMAR faults, but
2174 				 * probably the dump will still succeed.
2175 				 */
2176 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2177 				       iommu->name);
2178 				iommu_disable_translation(iommu);
2179 				clear_translation_pre_enabled(iommu);
2180 			} else {
2181 				pr_info("Copied translation tables from previous kernel for %s\n",
2182 					iommu->name);
2183 			}
2184 		}
2185 
2186 		intel_svm_check(iommu);
2187 	}
2188 
2189 	/*
2190 	 * Now that qi is enabled on all iommus, set the root entry and flush
2191 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2192 	 * flush_context function will loop forever and the boot hangs.
2193 	 */
2194 	for_each_active_iommu(iommu, drhd) {
2195 		iommu_flush_write_buffer(iommu);
2196 		iommu_set_root_entry(iommu);
2197 	}
2198 
2199 	check_tylersburg_isoch();
2200 
2201 	/*
2202 	 * for each drhd
2203 	 *   enable fault log
2204 	 *   global invalidate context cache
2205 	 *   global invalidate iotlb
2206 	 *   enable translation
2207 	 */
2208 	for_each_iommu(iommu, drhd) {
2209 		if (drhd->ignored) {
2210 			/*
2211 			 * we always have to disable PMRs or DMA may fail on
2212 			 * this device
2213 			 */
2214 			if (force_on)
2215 				iommu_disable_protect_mem_regions(iommu);
2216 			continue;
2217 		}
2218 
2219 		iommu_flush_write_buffer(iommu);
2220 
2221 		if (ecap_prs(iommu->ecap)) {
2222 			/*
2223 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2224 			 * could cause a possible lock race condition.
2225 			 */
2226 			up_write(&dmar_global_lock);
2227 			ret = intel_iommu_enable_prq(iommu);
2228 			down_write(&dmar_global_lock);
2229 			if (ret)
2230 				goto free_iommu;
2231 		}
2232 
2233 		ret = dmar_set_interrupt(iommu);
2234 		if (ret)
2235 			goto free_iommu;
2236 	}
2237 
2238 	return 0;
2239 
2240 free_iommu:
2241 	for_each_active_iommu(iommu, drhd) {
2242 		disable_dmar_iommu(iommu);
2243 		free_dmar_iommu(iommu);
2244 	}
2245 
2246 	return ret;
2247 }
2248 
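/*
 * Mark DRHD units that have no devices in scope, or that cover only
 * graphics devices while the graphics IOMMU is disabled, as ignored so
 * they are skipped during initialization.
 */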
2249 static void __init init_no_remapping_devices(void)
2250 {
2251 	struct dmar_drhd_unit *drhd;
2252 	struct device *dev;
2253 	int i;
2254 
2255 	for_each_drhd_unit(drhd) {
2256 		if (!drhd->include_all) {
2257 			for_each_active_dev_scope(drhd->devices,
2258 						  drhd->devices_cnt, i, dev)
2259 				break;
2260 			/* ignore DMAR unit if no devices exist */
2261 			if (i == drhd->devices_cnt)
2262 				drhd->ignored = 1;
2263 		}
2264 	}
2265 
2266 	for_each_active_drhd_unit(drhd) {
2267 		if (drhd->include_all)
2268 			continue;
2269 
2270 		for_each_active_dev_scope(drhd->devices,
2271 					  drhd->devices_cnt, i, dev)
2272 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2273 				break;
2274 		if (i < drhd->devices_cnt)
2275 			continue;
2276 
2277 		/* This IOMMU has *only* gfx devices. Either bypass it or
2278 		   set the gfx_dedicated flag, as appropriate. */
2279 		drhd->gfx_dedicated = 1;
2280 		if (disable_igfx_iommu)
2281 			drhd->ignored = 1;
2282 	}
2283 }
2284 
2285 #ifdef CONFIG_SUSPEND
2286 static int init_iommu_hw(void)
2287 {
2288 	struct dmar_drhd_unit *drhd;
2289 	struct intel_iommu *iommu = NULL;
2290 	int ret;
2291 
2292 	for_each_active_iommu(iommu, drhd) {
2293 		if (iommu->qi) {
2294 			ret = dmar_reenable_qi(iommu);
2295 			if (ret)
2296 				return ret;
2297 		}
2298 	}
2299 
2300 	for_each_iommu(iommu, drhd) {
2301 		if (drhd->ignored) {
2302 			/*
2303 			 * we always have to disable PMRs or DMA may fail on
2304 			 * this device
2305 			 */
2306 			if (force_on)
2307 				iommu_disable_protect_mem_regions(iommu);
2308 			continue;
2309 		}
2310 
2311 		iommu_flush_write_buffer(iommu);
2312 		iommu_set_root_entry(iommu);
2313 		iommu_enable_translation(iommu);
2314 		iommu_disable_protect_mem_regions(iommu);
2315 	}
2316 
2317 	return 0;
2318 }
2319 
2320 static void iommu_flush_all(void)
2321 {
2322 	struct dmar_drhd_unit *drhd;
2323 	struct intel_iommu *iommu;
2324 
2325 	for_each_active_iommu(iommu, drhd) {
2326 		iommu->flush.flush_context(iommu, 0, 0, 0,
2327 					   DMA_CCMD_GLOBAL_INVL);
2328 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2329 					 DMA_TLB_GLOBAL_FLUSH);
2330 	}
2331 }
2332 
2333 static int iommu_suspend(void)
2334 {
2335 	struct dmar_drhd_unit *drhd;
2336 	struct intel_iommu *iommu = NULL;
2337 	unsigned long flag;
2338 
2339 	iommu_flush_all();
2340 
2341 	for_each_active_iommu(iommu, drhd) {
2342 		iommu_disable_translation(iommu);
2343 
2344 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2345 
2346 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2347 			readl(iommu->reg + DMAR_FECTL_REG);
2348 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2349 			readl(iommu->reg + DMAR_FEDATA_REG);
2350 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2351 			readl(iommu->reg + DMAR_FEADDR_REG);
2352 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2353 			readl(iommu->reg + DMAR_FEUADDR_REG);
2354 
2355 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2356 	}
2357 	return 0;
2358 }
2359 
2360 static void iommu_resume(void)
2361 {
2362 	struct dmar_drhd_unit *drhd;
2363 	struct intel_iommu *iommu = NULL;
2364 	unsigned long flag;
2365 
2366 	if (init_iommu_hw()) {
2367 		if (force_on)
2368 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2369 		else
2370 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2371 		return;
2372 	}
2373 
2374 	for_each_active_iommu(iommu, drhd) {
2375 
2376 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2377 
2378 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2379 			iommu->reg + DMAR_FECTL_REG);
2380 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2381 			iommu->reg + DMAR_FEDATA_REG);
2382 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2383 			iommu->reg + DMAR_FEADDR_REG);
2384 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2385 			iommu->reg + DMAR_FEUADDR_REG);
2386 
2387 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2388 	}
2389 }
2390 
2391 static struct syscore_ops iommu_syscore_ops = {
2392 	.resume		= iommu_resume,
2393 	.suspend	= iommu_suspend,
2394 };
2395 
2396 static void __init init_iommu_pm_ops(void)
2397 {
2398 	register_syscore_ops(&iommu_syscore_ops);
2399 }
2400 
2401 #else
2402 static inline void init_iommu_pm_ops(void) {}
2403 #endif	/* CONFIG_SUSPEND */
2404 
2405 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2406 {
2407 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2408 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2409 	    rmrr->end_address <= rmrr->base_address ||
2410 	    arch_rmrr_sanity_check(rmrr))
2411 		return -EINVAL;
2412 
2413 	return 0;
2414 }
2415 
2416 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2417 {
2418 	struct acpi_dmar_reserved_memory *rmrr;
2419 	struct dmar_rmrr_unit *rmrru;
2420 
2421 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2422 	if (rmrr_sanity_check(rmrr)) {
2423 		pr_warn(FW_BUG
2424 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2425 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2426 			   rmrr->base_address, rmrr->end_address,
2427 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2428 			   dmi_get_system_info(DMI_BIOS_VERSION),
2429 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2430 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2431 	}
2432 
2433 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2434 	if (!rmrru)
2435 		goto out;
2436 
2437 	rmrru->hdr = header;
2438 
2439 	rmrru->base_address = rmrr->base_address;
2440 	rmrru->end_address = rmrr->end_address;
2441 
2442 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2443 				((void *)rmrr) + rmrr->header.length,
2444 				&rmrru->devices_cnt);
2445 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2446 		goto free_rmrru;
2447 
2448 	list_add(&rmrru->list, &dmar_rmrr_units);
2449 
2450 	return 0;
2451 free_rmrru:
2452 	kfree(rmrru);
2453 out:
2454 	return -ENOMEM;
2455 }
2456 
2457 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2458 {
2459 	struct dmar_atsr_unit *atsru;
2460 	struct acpi_dmar_atsr *tmp;
2461 
2462 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2463 				dmar_rcu_check()) {
2464 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2465 		if (atsr->segment != tmp->segment)
2466 			continue;
2467 		if (atsr->header.length != tmp->header.length)
2468 			continue;
2469 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2470 			return atsru;
2471 	}
2472 
2473 	return NULL;
2474 }
2475 
2476 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2477 {
2478 	struct acpi_dmar_atsr *atsr;
2479 	struct dmar_atsr_unit *atsru;
2480 
2481 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2482 		return 0;
2483 
2484 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2485 	atsru = dmar_find_atsr(atsr);
2486 	if (atsru)
2487 		return 0;
2488 
2489 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2490 	if (!atsru)
2491 		return -ENOMEM;
2492 
2493 	/*
2494 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2495 	 * copy the memory content because the memory buffer will be freed
2496 	 * on return.
2497 	 */
2498 	atsru->hdr = (void *)(atsru + 1);
2499 	memcpy(atsru->hdr, hdr, hdr->length);
2500 	atsru->include_all = atsr->flags & 0x1;
2501 	if (!atsru->include_all) {
2502 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2503 				(void *)atsr + atsr->header.length,
2504 				&atsru->devices_cnt);
2505 		if (atsru->devices_cnt && atsru->devices == NULL) {
2506 			kfree(atsru);
2507 			return -ENOMEM;
2508 		}
2509 	}
2510 
2511 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2512 
2513 	return 0;
2514 }
2515 
2516 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2517 {
2518 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2519 	kfree(atsru);
2520 }
2521 
2522 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2523 {
2524 	struct acpi_dmar_atsr *atsr;
2525 	struct dmar_atsr_unit *atsru;
2526 
2527 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2528 	atsru = dmar_find_atsr(atsr);
2529 	if (atsru) {
2530 		list_del_rcu(&atsru->list);
2531 		synchronize_rcu();
2532 		intel_iommu_free_atsr(atsru);
2533 	}
2534 
2535 	return 0;
2536 }
2537 
2538 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2539 {
2540 	int i;
2541 	struct device *dev;
2542 	struct acpi_dmar_atsr *atsr;
2543 	struct dmar_atsr_unit *atsru;
2544 
2545 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2546 	atsru = dmar_find_atsr(atsr);
2547 	if (!atsru)
2548 		return 0;
2549 
2550 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2551 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2552 					  i, dev)
2553 			return -EBUSY;
2554 	}
2555 
2556 	return 0;
2557 }
2558 
2559 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2560 {
2561 	struct dmar_satc_unit *satcu;
2562 	struct acpi_dmar_satc *tmp;
2563 
2564 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2565 				dmar_rcu_check()) {
2566 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2567 		if (satc->segment != tmp->segment)
2568 			continue;
2569 		if (satc->header.length != tmp->header.length)
2570 			continue;
2571 		if (memcmp(satc, tmp, satc->header.length) == 0)
2572 			return satcu;
2573 	}
2574 
2575 	return NULL;
2576 }
2577 
2578 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2579 {
2580 	struct acpi_dmar_satc *satc;
2581 	struct dmar_satc_unit *satcu;
2582 
2583 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2584 		return 0;
2585 
2586 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2587 	satcu = dmar_find_satc(satc);
2588 	if (satcu)
2589 		return 0;
2590 
2591 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2592 	if (!satcu)
2593 		return -ENOMEM;
2594 
2595 	satcu->hdr = (void *)(satcu + 1);
2596 	memcpy(satcu->hdr, hdr, hdr->length);
2597 	satcu->atc_required = satc->flags & 0x1;
2598 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2599 					      (void *)satc + satc->header.length,
2600 					      &satcu->devices_cnt);
2601 	if (satcu->devices_cnt && !satcu->devices) {
2602 		kfree(satcu);
2603 		return -ENOMEM;
2604 	}
2605 	list_add_rcu(&satcu->list, &dmar_satc_units);
2606 
2607 	return 0;
2608 }
2609 
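/*
 * Bring up a hot-added DMAR unit: allocate its domain bookkeeping and root
 * entry, set up queued invalidation, the page request queue and the fault
 * interrupt, then enable translation unless the unit is ignored.
 */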
2610 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2611 {
2612 	struct intel_iommu *iommu = dmaru->iommu;
2613 	int ret;
2614 
2615 	/*
2616 	 * Disable translation if already enabled prior to OS handover.
2617 	 */
2618 	if (iommu->gcmd & DMA_GCMD_TE)
2619 		iommu_disable_translation(iommu);
2620 
2621 	ret = iommu_init_domains(iommu);
2622 	if (ret == 0)
2623 		ret = iommu_alloc_root_entry(iommu);
2624 	if (ret)
2625 		goto out;
2626 
2627 	intel_svm_check(iommu);
2628 
2629 	if (dmaru->ignored) {
2630 		/*
2631 		 * we always have to disable PMRs or DMA may fail on this device
2632 		 */
2633 		if (force_on)
2634 			iommu_disable_protect_mem_regions(iommu);
2635 		return 0;
2636 	}
2637 
2638 	intel_iommu_init_qi(iommu);
2639 	iommu_flush_write_buffer(iommu);
2640 
2641 	if (ecap_prs(iommu->ecap)) {
2642 		ret = intel_iommu_enable_prq(iommu);
2643 		if (ret)
2644 			goto disable_iommu;
2645 	}
2646 
2647 	ret = dmar_set_interrupt(iommu);
2648 	if (ret)
2649 		goto disable_iommu;
2650 
2651 	iommu_set_root_entry(iommu);
2652 	iommu_enable_translation(iommu);
2653 
2654 	iommu_disable_protect_mem_regions(iommu);
2655 	return 0;
2656 
2657 disable_iommu:
2658 	disable_dmar_iommu(iommu);
2659 out:
2660 	free_dmar_iommu(iommu);
2661 	return ret;
2662 }
2663 
2664 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2665 {
2666 	int ret = 0;
2667 	struct intel_iommu *iommu = dmaru->iommu;
2668 
2669 	if (!intel_iommu_enabled)
2670 		return 0;
2671 	if (iommu == NULL)
2672 		return -EINVAL;
2673 
2674 	if (insert) {
2675 		ret = intel_iommu_add(dmaru);
2676 	} else {
2677 		disable_dmar_iommu(iommu);
2678 		free_dmar_iommu(iommu);
2679 	}
2680 
2681 	return ret;
2682 }
2683 
2684 static void intel_iommu_free_dmars(void)
2685 {
2686 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2687 	struct dmar_atsr_unit *atsru, *atsr_n;
2688 	struct dmar_satc_unit *satcu, *satc_n;
2689 
2690 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2691 		list_del(&rmrru->list);
2692 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2693 		kfree(rmrru);
2694 	}
2695 
2696 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2697 		list_del(&atsru->list);
2698 		intel_iommu_free_atsr(atsru);
2699 	}
2700 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2701 		list_del(&satcu->list);
2702 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2703 		kfree(satcu);
2704 	}
2705 }
2706 
2707 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2708 {
2709 	struct dmar_satc_unit *satcu;
2710 	struct acpi_dmar_satc *satc;
2711 	struct device *tmp;
2712 	int i;
2713 
2714 	dev = pci_physfn(dev);
2715 	rcu_read_lock();
2716 
2717 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2718 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2719 		if (satc->segment != pci_domain_nr(dev->bus))
2720 			continue;
2721 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2722 			if (to_pci_dev(tmp) == dev)
2723 				goto out;
2724 	}
2725 	satcu = NULL;
2726 out:
2727 	rcu_read_unlock();
2728 	return satcu;
2729 }
2730 
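/*
 * Decide whether ATS may be enabled for @dev: honour the SATC table if the
 * device is listed there, otherwise walk up to the root port and match it
 * against the ATSR units for the device's segment.
 */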
2731 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2732 {
2733 	int i, ret = 1;
2734 	struct pci_bus *bus;
2735 	struct pci_dev *bridge = NULL;
2736 	struct device *tmp;
2737 	struct acpi_dmar_atsr *atsr;
2738 	struct dmar_atsr_unit *atsru;
2739 	struct dmar_satc_unit *satcu;
2740 
2741 	dev = pci_physfn(dev);
2742 	satcu = dmar_find_matched_satc_unit(dev);
2743 	if (satcu)
2744 		/*
2745 		 * This device supports ATS as it is in the SATC table.
2746 		 * When the IOMMU is in legacy mode, ATS is enabled
2747 		 * automatically by HW for devices that require it, so
2748 		 * the OS should not enable ATS on this device to avoid
2749 		 * duplicated TLB invalidation.
2750 		 */
2751 		return !(satcu->atc_required && !sm_supported(iommu));
2752 
2753 	for (bus = dev->bus; bus; bus = bus->parent) {
2754 		bridge = bus->self;
2755 		/* If it's an integrated device, allow ATS */
2756 		if (!bridge)
2757 			return 1;
2758 		/* Connected via non-PCIe: no ATS */
2759 		if (!pci_is_pcie(bridge) ||
2760 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2761 			return 0;
2762 		/* If we found the root port, look it up in the ATSR */
2763 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2764 			break;
2765 	}
2766 
2767 	rcu_read_lock();
2768 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2769 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2770 		if (atsr->segment != pci_domain_nr(dev->bus))
2771 			continue;
2772 
2773 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2774 			if (tmp == &bridge->dev)
2775 				goto out;
2776 
2777 		if (atsru->include_all)
2778 			goto out;
2779 	}
2780 	ret = 0;
2781 out:
2782 	rcu_read_unlock();
2783 
2784 	return ret;
2785 }
2786 
2787 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2788 {
2789 	int ret;
2790 	struct dmar_rmrr_unit *rmrru;
2791 	struct dmar_atsr_unit *atsru;
2792 	struct dmar_satc_unit *satcu;
2793 	struct acpi_dmar_atsr *atsr;
2794 	struct acpi_dmar_reserved_memory *rmrr;
2795 	struct acpi_dmar_satc *satc;
2796 
2797 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2798 		return 0;
2799 
2800 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2801 		rmrr = container_of(rmrru->hdr,
2802 				    struct acpi_dmar_reserved_memory, header);
2803 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2804 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2805 				((void *)rmrr) + rmrr->header.length,
2806 				rmrr->segment, rmrru->devices,
2807 				rmrru->devices_cnt);
2808 			if (ret < 0)
2809 				return ret;
2810 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2811 			dmar_remove_dev_scope(info, rmrr->segment,
2812 				rmrru->devices, rmrru->devices_cnt);
2813 		}
2814 	}
2815 
2816 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2817 		if (atsru->include_all)
2818 			continue;
2819 
2820 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2821 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2822 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2823 					(void *)atsr + atsr->header.length,
2824 					atsr->segment, atsru->devices,
2825 					atsru->devices_cnt);
2826 			if (ret > 0)
2827 				break;
2828 			else if (ret < 0)
2829 				return ret;
2830 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2831 			if (dmar_remove_dev_scope(info, atsr->segment,
2832 					atsru->devices, atsru->devices_cnt))
2833 				break;
2834 		}
2835 	}
2836 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2837 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2838 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2839 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2840 					(void *)satc + satc->header.length,
2841 					satc->segment, satcu->devices,
2842 					satcu->devices_cnt);
2843 			if (ret > 0)
2844 				break;
2845 			else if (ret < 0)
2846 				return ret;
2847 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2848 			if (dmar_remove_dev_scope(info, satc->segment,
2849 					satcu->devices, satcu->devices_cnt))
2850 				break;
2851 		}
2852 	}
2853 
2854 	return 0;
2855 }
2856 
2857 static void intel_disable_iommus(void)
2858 {
2859 	struct intel_iommu *iommu = NULL;
2860 	struct dmar_drhd_unit *drhd;
2861 
2862 	for_each_iommu(iommu, drhd)
2863 		iommu_disable_translation(iommu);
2864 }
2865 
2866 void intel_iommu_shutdown(void)
2867 {
2868 	struct dmar_drhd_unit *drhd;
2869 	struct intel_iommu *iommu = NULL;
2870 
2871 	if (no_iommu || dmar_disabled)
2872 		return;
2873 
2874 	down_write(&dmar_global_lock);
2875 
2876 	/* Disable PMRs explicitly here. */
2877 	for_each_iommu(iommu, drhd)
2878 		iommu_disable_protect_mem_regions(iommu);
2879 
2880 	/* Make sure the IOMMUs are switched off */
2881 	intel_disable_iommus();
2882 
2883 	up_write(&dmar_global_lock);
2884 }
2885 
2886 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2887 {
2888 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2889 
2890 	return container_of(iommu_dev, struct intel_iommu, iommu);
2891 }
2892 
2893 static ssize_t version_show(struct device *dev,
2894 			    struct device_attribute *attr, char *buf)
2895 {
2896 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2897 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2898 	return sysfs_emit(buf, "%d:%d\n",
2899 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2900 }
2901 static DEVICE_ATTR_RO(version);
2902 
2903 static ssize_t address_show(struct device *dev,
2904 			    struct device_attribute *attr, char *buf)
2905 {
2906 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2907 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2908 }
2909 static DEVICE_ATTR_RO(address);
2910 
2911 static ssize_t cap_show(struct device *dev,
2912 			struct device_attribute *attr, char *buf)
2913 {
2914 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2915 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2916 }
2917 static DEVICE_ATTR_RO(cap);
2918 
2919 static ssize_t ecap_show(struct device *dev,
2920 			 struct device_attribute *attr, char *buf)
2921 {
2922 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2923 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2924 }
2925 static DEVICE_ATTR_RO(ecap);
2926 
2927 static ssize_t domains_supported_show(struct device *dev,
2928 				      struct device_attribute *attr, char *buf)
2929 {
2930 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2931 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2932 }
2933 static DEVICE_ATTR_RO(domains_supported);
2934 
2935 static ssize_t domains_used_show(struct device *dev,
2936 				 struct device_attribute *attr, char *buf)
2937 {
2938 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2939 	return sysfs_emit(buf, "%d\n",
2940 			  bitmap_weight(iommu->domain_ids,
2941 					cap_ndoms(iommu->cap)));
2942 }
2943 static DEVICE_ATTR_RO(domains_used);
2944 
2945 static struct attribute *intel_iommu_attrs[] = {
2946 	&dev_attr_version.attr,
2947 	&dev_attr_address.attr,
2948 	&dev_attr_cap.attr,
2949 	&dev_attr_ecap.attr,
2950 	&dev_attr_domains_supported.attr,
2951 	&dev_attr_domains_used.attr,
2952 	NULL,
2953 };
2954 
2955 static struct attribute_group intel_iommu_group = {
2956 	.name = "intel-iommu",
2957 	.attrs = intel_iommu_attrs,
2958 };
2959 
2960 const struct attribute_group *intel_iommu_groups[] = {
2961 	&intel_iommu_group,
2962 	NULL,
2963 };
2964 
2965 static bool has_external_pci(void)
2966 {
2967 	struct pci_dev *pdev = NULL;
2968 
2969 	for_each_pci_dev(pdev)
2970 		if (pdev->external_facing) {
2971 			pci_dev_put(pdev);
2972 			return true;
2973 		}
2974 
2975 	return false;
2976 }
2977 
2978 static int __init platform_optin_force_iommu(void)
2979 {
2980 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2981 		return 0;
2982 
2983 	if (no_iommu || dmar_disabled)
2984 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2985 
2986 	/*
2987 	 * If Intel-IOMMU is disabled by default, we will apply identity
2988 	 * map for all devices except those marked as being untrusted.
2989 	 */
2990 	if (dmar_disabled)
2991 		iommu_set_default_passthrough(false);
2992 
2993 	dmar_disabled = 0;
2994 	no_iommu = 0;
2995 
2996 	return 1;
2997 }
2998 
2999 static int __init probe_acpi_namespace_devices(void)
3000 {
3001 	struct dmar_drhd_unit *drhd;
3002 	/* To avoid a -Wunused-but-set-variable warning. */
3003 	struct intel_iommu *iommu __maybe_unused;
3004 	struct device *dev;
3005 	int i, ret = 0;
3006 
3007 	for_each_active_iommu(iommu, drhd) {
3008 		for_each_active_dev_scope(drhd->devices,
3009 					  drhd->devices_cnt, i, dev) {
3010 			struct acpi_device_physical_node *pn;
3011 			struct acpi_device *adev;
3012 
3013 			if (dev->bus != &acpi_bus_type)
3014 				continue;
3015 
3016 			adev = to_acpi_device(dev);
3017 			mutex_lock(&adev->physical_node_lock);
3018 			list_for_each_entry(pn,
3019 					    &adev->physical_node_list, node) {
3020 				ret = iommu_probe_device(pn->dev);
3021 				if (ret)
3022 					break;
3023 			}
3024 			mutex_unlock(&adev->physical_node_lock);
3025 
3026 			if (ret)
3027 				return ret;
3028 		}
3029 	}
3030 
3031 	return 0;
3032 }
3033 
3034 static __init int tboot_force_iommu(void)
3035 {
3036 	if (!tboot_enabled())
3037 		return 0;
3038 
3039 	if (no_iommu || dmar_disabled)
3040 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3041 
3042 	dmar_disabled = 0;
3043 	no_iommu = 0;
3044 
3045 	return 1;
3046 }
3047 
3048 int __init intel_iommu_init(void)
3049 {
3050 	int ret = -ENODEV;
3051 	struct dmar_drhd_unit *drhd;
3052 	struct intel_iommu *iommu;
3053 
3054 	/*
3055 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3056 	 * opt in, so enforce that.
3057 	 */
3058 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3059 		    platform_optin_force_iommu();
3060 
3061 	down_write(&dmar_global_lock);
3062 	if (dmar_table_init()) {
3063 		if (force_on)
3064 			panic("tboot: Failed to initialize DMAR table\n");
3065 		goto out_free_dmar;
3066 	}
3067 
3068 	if (dmar_dev_scope_init() < 0) {
3069 		if (force_on)
3070 			panic("tboot: Failed to initialize DMAR device scope\n");
3071 		goto out_free_dmar;
3072 	}
3073 
3074 	up_write(&dmar_global_lock);
3075 
3076 	/*
3077 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3078 	 * complain later when we register it under the lock.
3079 	 */
3080 	dmar_register_bus_notifier();
3081 
3082 	down_write(&dmar_global_lock);
3083 
3084 	if (!no_iommu)
3085 		intel_iommu_debugfs_init();
3086 
3087 	if (no_iommu || dmar_disabled) {
3088 		/*
3089 		 * We exit the function here to ensure the IOMMU's remapping and
3090 		 * mempool aren't set up, which means that the IOMMU's PMRs
3091 		 * won't be disabled via the call to init_dmars(). So disable
3092 		 * them explicitly here. The PMRs were set up by tboot prior to
3093 		 * calling SENTER, but the kernel is expected to reset/tear
3094 		 * down the PMRs.
3095 		 */
3096 		if (intel_iommu_tboot_noforce) {
3097 			for_each_iommu(iommu, drhd)
3098 				iommu_disable_protect_mem_regions(iommu);
3099 		}
3100 
3101 		/*
3102 		 * Make sure the IOMMUs are switched off, even when we
3103 		 * boot into a kexec kernel and the previous kernel left
3104 		 * them enabled
3105 		 */
3106 		intel_disable_iommus();
3107 		goto out_free_dmar;
3108 	}
3109 
3110 	if (list_empty(&dmar_rmrr_units))
3111 		pr_info("No RMRR found\n");
3112 
3113 	if (list_empty(&dmar_atsr_units))
3114 		pr_info("No ATSR found\n");
3115 
3116 	if (list_empty(&dmar_satc_units))
3117 		pr_info("No SATC found\n");
3118 
3119 	init_no_remapping_devices();
3120 
3121 	ret = init_dmars();
3122 	if (ret) {
3123 		if (force_on)
3124 			panic("tboot: Failed to initialize DMARs\n");
3125 		pr_err("Initialization failed\n");
3126 		goto out_free_dmar;
3127 	}
3128 	up_write(&dmar_global_lock);
3129 
3130 	init_iommu_pm_ops();
3131 
3132 	down_read(&dmar_global_lock);
3133 	for_each_active_iommu(iommu, drhd) {
3134 		/*
3135 		 * The flush queue implementation does not perform
3136 		 * page-selective invalidations that are required for efficient
3137 		 * TLB flushes in virtual environments.  The benefit of batching
3138 		 * is likely to be much lower than the overhead of synchronizing
3139 		 * the virtual and physical IOMMU page-tables.
3140 		 */
3141 		if (cap_caching_mode(iommu->cap) &&
3142 		    !first_level_by_default(iommu)) {
3143 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3144 			iommu_set_dma_strict();
3145 		}
3146 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3147 				       intel_iommu_groups,
3148 				       "%s", iommu->name);
3149 		/*
3150 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3151 		 * Release the dmar_global_lock before entering the device probe path
3152 		 * to avoid unnecessary lock order splat.
3153 		 */
3154 		up_read(&dmar_global_lock);
3155 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3156 		down_read(&dmar_global_lock);
3157 
3158 		iommu_pmu_register(iommu);
3159 	}
3160 
3161 	if (probe_acpi_namespace_devices())
3162 		pr_warn("ACPI name space devices didn't probe correctly\n");
3163 
3164 	/* Finally, we enable the DMA remapping hardware. */
3165 	for_each_iommu(iommu, drhd) {
3166 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3167 			iommu_enable_translation(iommu);
3168 
3169 		iommu_disable_protect_mem_regions(iommu);
3170 	}
3171 	up_read(&dmar_global_lock);
3172 
3173 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3174 
3175 	intel_iommu_enabled = 1;
3176 
3177 	return 0;
3178 
3179 out_free_dmar:
3180 	intel_iommu_free_dmars();
3181 	up_write(&dmar_global_lock);
3182 	return ret;
3183 }
3184 
3185 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3186 {
3187 	struct device_domain_info *info = opaque;
3188 
3189 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3190 	return 0;
3191 }
3192 
3193 /*
3194  * NB - intel-iommu lacks any sort of reference counting for the users of
3195  * dependent devices.  If multiple endpoints have intersecting dependent
3196  * devices, unbinding the driver from any one of them will possibly leave
3197  * the others unable to operate.
3198  */
3199 static void domain_context_clear(struct device_domain_info *info)
3200 {
3201 	if (!dev_is_pci(info->dev)) {
3202 		domain_context_clear_one(info, info->bus, info->devfn);
3203 		return;
3204 	}
3205 
3206 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3207 			       &domain_context_clear_one_cb, info);
3208 }
3209 
3210 /*
3211  * Clear the page table pointer in context or pasid table entries so that
3212  * all DMA requests without PASID from the device are blocked. If the page
3213  * table has been set, clean up the data structures.
3214  */
3215 void device_block_translation(struct device *dev)
3216 {
3217 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3218 	struct intel_iommu *iommu = info->iommu;
3219 	unsigned long flags;
3220 
3221 	if (info->domain)
3222 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3223 
3224 	iommu_disable_pci_caps(info);
3225 	if (!dev_is_real_dma_subdevice(dev)) {
3226 		if (sm_supported(iommu))
3227 			intel_pasid_tear_down_entry(iommu, dev,
3228 						    IOMMU_NO_PASID, false);
3229 		else
3230 			domain_context_clear(info);
3231 	}
3232 
3233 	if (!info->domain)
3234 		return;
3235 
3236 	spin_lock_irqsave(&info->domain->lock, flags);
3237 	list_del(&info->link);
3238 	spin_unlock_irqrestore(&info->domain->lock, flags);
3239 
3240 	domain_detach_iommu(info->domain, iommu);
3241 	info->domain = NULL;
3242 }
3243 
3244 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3245 				      struct device *dev)
3246 {
3247 	device_block_translation(dev);
3248 	return 0;
3249 }
3250 
3251 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3252 					 struct device *dev, ioasid_t pasid,
3253 					 struct iommu_domain *old);
3254 
3255 static struct iommu_domain blocking_domain = {
3256 	.type = IOMMU_DOMAIN_BLOCKED,
3257 	.ops = &(const struct iommu_domain_ops) {
3258 		.attach_dev	= blocking_domain_attach_dev,
3259 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3260 	}
3261 };
3262 
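/*
 * Return the number of superpage levels supported for the given translation
 * stage (0 = none, 1 = 2MiB, 2 = 2MiB and 1GiB), honouring the
 * intel_iommu_superpage setting.
 */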
3263 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3264 {
3265 	if (!intel_iommu_superpage)
3266 		return 0;
3267 
3268 	if (first_stage)
3269 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3270 
3271 	return fls(cap_super_page_val(iommu->cap));
3272 }
3273 
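/*
 * Allocate and initialize a paging domain for @dev: derive the address
 * width and superpage capabilities from the device's IOMMU, set up the
 * IOVA aperture (skipping the non-canonical hole for first-stage
 * translation) and allocate the top-level page directory.
 */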
3274 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3275 {
3276 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3277 	struct intel_iommu *iommu = info->iommu;
3278 	struct dmar_domain *domain;
3279 	int addr_width;
3280 
3281 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3282 	if (!domain)
3283 		return ERR_PTR(-ENOMEM);
3284 
3285 	INIT_LIST_HEAD(&domain->devices);
3286 	INIT_LIST_HEAD(&domain->dev_pasids);
3287 	INIT_LIST_HEAD(&domain->cache_tags);
3288 	spin_lock_init(&domain->lock);
3289 	spin_lock_init(&domain->cache_lock);
3290 	xa_init(&domain->iommu_array);
3291 
3292 	domain->nid = dev_to_node(dev);
3293 	domain->use_first_level = first_stage;
3294 
3295 	/* calculate the address width */
3296 	addr_width = agaw_to_width(iommu->agaw);
3297 	if (addr_width > cap_mgaw(iommu->cap))
3298 		addr_width = cap_mgaw(iommu->cap);
3299 	domain->gaw = addr_width;
3300 	domain->agaw = iommu->agaw;
3301 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3302 
3303 	/* iommu memory access coherency */
3304 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3305 
3306 	/* pagesize bitmap */
3307 	domain->domain.pgsize_bitmap = SZ_4K;
3308 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3309 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3310 
3311 	/*
3312 	 * IOVA aperture: First-level translation restricts the input-address
3313 	 * to a canonical address (i.e., address bits 63:N have the same value
3314 	 * as address bit [N-1], where N is 48 bits with 4-level paging and
3315 	 * 57 bits with 5-level paging). Hence, skip bit [N-1].
3316 	 */
3317 	domain->domain.geometry.force_aperture = true;
3318 	domain->domain.geometry.aperture_start = 0;
3319 	if (first_stage)
3320 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3321 	else
3322 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3323 
3324 	/* always allocate the top pgd */
3325 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3326 	if (!domain->pgd) {
3327 		kfree(domain);
3328 		return ERR_PTR(-ENOMEM);
3329 	}
3330 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3331 
3332 	return domain;
3333 }
3334 
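/*
 * Allocate a paging domain on behalf of the IOMMU core/iommufd: validate
 * the requested flags against the IOMMU capabilities, choose first- or
 * second-stage translation accordingly, and set up nesting-parent and
 * dirty-tracking state when requested.
 */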
3335 static struct iommu_domain *
3336 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3337 				      const struct iommu_user_data *user_data)
3338 {
3339 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3340 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3341 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3342 	struct intel_iommu *iommu = info->iommu;
3343 	struct dmar_domain *dmar_domain;
3344 	struct iommu_domain *domain;
3345 	bool first_stage;
3346 
3347 	if (flags &
3348 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3349 		return ERR_PTR(-EOPNOTSUPP);
3350 	if (nested_parent && !nested_supported(iommu))
3351 		return ERR_PTR(-EOPNOTSUPP);
3352 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3353 		return ERR_PTR(-EOPNOTSUPP);
3354 
3355 	/*
3356 	 * Always allocate the guest compatible page table unless
3357 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3358 	 * is specified.
3359 	 */
3360 	if (nested_parent || dirty_tracking) {
3361 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3362 			return ERR_PTR(-EOPNOTSUPP);
3363 		first_stage = false;
3364 	} else {
3365 		first_stage = first_level_by_default(iommu);
3366 	}
3367 
3368 	dmar_domain = paging_domain_alloc(dev, first_stage);
3369 	if (IS_ERR(dmar_domain))
3370 		return ERR_CAST(dmar_domain);
3371 	domain = &dmar_domain->domain;
3372 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3373 	domain->owner = &intel_iommu_ops;
3374 	domain->ops = intel_iommu_ops.default_domain_ops;
3375 
3376 	if (nested_parent) {
3377 		dmar_domain->nested_parent = true;
3378 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3379 		spin_lock_init(&dmar_domain->s1_lock);
3380 	}
3381 
3382 	if (dirty_tracking) {
3383 		if (dmar_domain->use_first_level) {
3384 			iommu_domain_free(domain);
3385 			return ERR_PTR(-EOPNOTSUPP);
3386 		}
3387 		domain->dirty_ops = &intel_dirty_ops;
3388 	}
3389 
3390 	return domain;
3391 }
3392 
3393 static void intel_iommu_domain_free(struct iommu_domain *domain)
3394 {
3395 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3396 
3397 	WARN_ON(dmar_domain->nested_parent &&
3398 		!list_empty(&dmar_domain->s1_domains));
3399 	domain_exit(dmar_domain);
3400 }
3401 
3402 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3403 {
3404 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3405 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3406 	struct intel_iommu *iommu = info->iommu;
3407 	int addr_width;
3408 
3409 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3410 		return -EPERM;
3411 
3412 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3413 		return -EINVAL;
3414 
3415 	if (domain->dirty_ops && !ssads_supported(iommu))
3416 		return -EINVAL;
3417 
3418 	if (dmar_domain->iommu_coherency !=
3419 			iommu_paging_structure_coherency(iommu))
3420 		return -EINVAL;
3421 
3422 	if (dmar_domain->iommu_superpage !=
3423 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3424 		return -EINVAL;
3425 
3426 	if (dmar_domain->use_first_level &&
3427 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3428 		return -EINVAL;
3429 
3430 	/* check if this iommu agaw is sufficient for max mapped address */
3431 	addr_width = agaw_to_width(iommu->agaw);
3432 	if (addr_width > cap_mgaw(iommu->cap))
3433 		addr_width = cap_mgaw(iommu->cap);
3434 
3435 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3436 		return -EINVAL;
3437 
3438 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3439 	    context_copied(iommu, info->bus, info->devfn))
3440 		return intel_pasid_setup_sm_context(dev);
3441 
3442 	return 0;
3443 }
3444 
3445 static int intel_iommu_attach_device(struct iommu_domain *domain,
3446 				     struct device *dev)
3447 {
3448 	int ret;
3449 
3450 	device_block_translation(dev);
3451 
3452 	ret = paging_domain_compatible(domain, dev);
3453 	if (ret)
3454 		return ret;
3455 
3456 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3457 }
3458 
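/*
 * Core mapping helper for the IOMMU API: translate IOMMU_READ/WRITE into
 * DMA PTE bits, extend the domain's max_addr bookkeeping, and hand the
 * page-aligned range to __domain_mapping().
 */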
3459 static int intel_iommu_map(struct iommu_domain *domain,
3460 			   unsigned long iova, phys_addr_t hpa,
3461 			   size_t size, int iommu_prot, gfp_t gfp)
3462 {
3463 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3464 	u64 max_addr;
3465 	int prot = 0;
3466 
3467 	if (iommu_prot & IOMMU_READ)
3468 		prot |= DMA_PTE_READ;
3469 	if (iommu_prot & IOMMU_WRITE)
3470 		prot |= DMA_PTE_WRITE;
3471 	if (dmar_domain->set_pte_snp)
3472 		prot |= DMA_PTE_SNP;
3473 
3474 	max_addr = iova + size;
3475 	if (dmar_domain->max_addr < max_addr) {
3476 		u64 end;
3477 
3478 		/* check if minimum agaw is sufficient for mapped address */
3479 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3480 		if (end < max_addr) {
3481 			pr_err("%s: iommu width (%d) is not "
3482 			       "sufficient for the mapped address (%llx)\n",
3483 			       __func__, dmar_domain->gaw, max_addr);
3484 			return -EFAULT;
3485 		}
3486 		dmar_domain->max_addr = max_addr;
3487 	}
3488 	/* Round up size to next multiple of PAGE_SIZE, if it and
3489 	   the low bits of hpa would take us onto the next page */
3490 	size = aligned_nrpages(hpa, size);
3491 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3492 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3493 }
3494 
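/*
 * Only 4KiB, 2MiB and 1GiB page sizes are accepted, and both the IOVA and
 * the physical address must be aligned to the requested page size.
 */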
3495 static int intel_iommu_map_pages(struct iommu_domain *domain,
3496 				 unsigned long iova, phys_addr_t paddr,
3497 				 size_t pgsize, size_t pgcount,
3498 				 int prot, gfp_t gfp, size_t *mapped)
3499 {
3500 	unsigned long pgshift = __ffs(pgsize);
3501 	size_t size = pgcount << pgshift;
3502 	int ret;
3503 
3504 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3505 		return -EINVAL;
3506 
3507 	if (!IS_ALIGNED(iova | paddr, pgsize))
3508 		return -EINVAL;
3509 
3510 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3511 	if (!ret && mapped)
3512 		*mapped = size;
3513 
3514 	return ret;
3515 }
3516 
3517 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3518 				unsigned long iova, size_t size,
3519 				struct iommu_iotlb_gather *gather)
3520 {
3521 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3522 	unsigned long start_pfn, last_pfn;
3523 	int level = 0;
3524 
3525 	/* Cope with horrid API which requires us to unmap more than the
3526 	   size argument if it happens to be a large-page mapping. */
3527 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3528 				     &level, GFP_ATOMIC)))
3529 		return 0;
3530 
3531 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3532 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3533 
3534 	start_pfn = iova >> VTD_PAGE_SHIFT;
3535 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3536 
3537 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3538 
3539 	if (dmar_domain->max_addr == iova + size)
3540 		dmar_domain->max_addr = iova;
3541 
3542 	/*
3543 	 * We do not use page-selective IOTLB invalidation in the flush queue,
3544 	 * so there is no need to track pages and sync the IOTLB.
3545 	 */
3546 	if (!iommu_iotlb_gather_queued(gather))
3547 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3548 
3549 	return size;
3550 }
3551 
3552 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3553 				      unsigned long iova,
3554 				      size_t pgsize, size_t pgcount,
3555 				      struct iommu_iotlb_gather *gather)
3556 {
3557 	unsigned long pgshift = __ffs(pgsize);
3558 	size_t size = pgcount << pgshift;
3559 
3560 	return intel_iommu_unmap(domain, iova, size, gather);
3561 }
3562 
3563 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3564 				 struct iommu_iotlb_gather *gather)
3565 {
3566 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3567 			      gather->end, list_empty(&gather->freelist));
3568 	iommu_put_pages_list(&gather->freelist);
3569 }
3570 
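/*
 * Walk the page table to the leaf entry covering @iova and combine the
 * entry's address with the offset of @iova within that (possibly
 * super-sized) page. Returns 0 if no mapping is present.
 */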
3571 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3572 					    dma_addr_t iova)
3573 {
3574 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3575 	struct dma_pte *pte;
3576 	int level = 0;
3577 	u64 phys = 0;
3578 
3579 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3580 			     GFP_ATOMIC);
3581 	if (pte && dma_pte_present(pte))
3582 		phys = dma_pte_addr(pte) +
3583 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3584 						VTD_PAGE_SHIFT) - 1));
3585 
3586 	return phys;
3587 }
3588 
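/*
 * Force snooping can only be enforced if every IOMMU serving a device in
 * this domain supports Snoop Control (SC) in its extended capabilities.
 */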
3589 static bool domain_support_force_snooping(struct dmar_domain *domain)
3590 {
3591 	struct device_domain_info *info;
3592 	bool support = true;
3593 
3594 	assert_spin_locked(&domain->lock);
3595 	list_for_each_entry(info, &domain->devices, link) {
3596 		if (!ecap_sc_support(info->iommu->ecap)) {
3597 			support = false;
3598 			break;
3599 		}
3600 	}
3601 
3602 	return support;
3603 }
3604 
3605 static void domain_set_force_snooping(struct dmar_domain *domain)
3606 {
3607 	struct device_domain_info *info;
3608 
3609 	assert_spin_locked(&domain->lock);
3610 	/*
3611 	 * The second-level page table supports per-PTE snoop control. The
3612 	 * iommu_map() interface will handle this by setting the SNP bit.
3613 	 */
3614 	if (!domain->use_first_level) {
3615 		domain->set_pte_snp = true;
3616 		return;
3617 	}
3618 
3619 	list_for_each_entry(info, &domain->devices, link)
3620 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3621 						     IOMMU_NO_PASID);
3622 }
3623 
3624 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3625 {
3626 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3627 	unsigned long flags;
3628 
3629 	if (dmar_domain->force_snooping)
3630 		return true;
3631 
3632 	spin_lock_irqsave(&dmar_domain->lock, flags);
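	/*
	 * Second-level page tables enforce snooping via the SNP bit, which is
	 * only set at map time; a second-level domain that already has
	 * mappings therefore cannot be switched to force snooping.
	 */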
3633 	if (!domain_support_force_snooping(dmar_domain) ||
3634 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3635 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3636 		return false;
3637 	}
3638 
3639 	domain_set_force_snooping(dmar_domain);
3640 	dmar_domain->force_snooping = true;
3641 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3642 
3643 	return true;
3644 }
3645 
3646 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3647 {
3648 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3649 
3650 	switch (cap) {
3651 	case IOMMU_CAP_CACHE_COHERENCY:
3652 	case IOMMU_CAP_DEFERRED_FLUSH:
3653 		return true;
3654 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3655 		return dmar_platform_optin();
3656 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3657 		return ecap_sc_support(info->iommu->ecap);
3658 	case IOMMU_CAP_DIRTY_TRACKING:
3659 		return ssads_supported(info->iommu);
3660 	default:
3661 		return false;
3662 	}
3663 }
3664 
3665 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3666 {
3667 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3668 	struct device_domain_info *info;
3669 	struct intel_iommu *iommu;
3670 	u8 bus, devfn;
3671 	int ret;
3672 
3673 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3674 	if (!iommu || !iommu->iommu.ops)
3675 		return ERR_PTR(-ENODEV);
3676 
3677 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3678 	if (!info)
3679 		return ERR_PTR(-ENOMEM);
3680 
3681 	if (dev_is_real_dma_subdevice(dev)) {
3682 		info->bus = pdev->bus->number;
3683 		info->devfn = pdev->devfn;
3684 		info->segment = pci_domain_nr(pdev->bus);
3685 	} else {
3686 		info->bus = bus;
3687 		info->devfn = devfn;
3688 		info->segment = iommu->segment;
3689 	}
3690 
3691 	info->dev = dev;
3692 	info->iommu = iommu;
3693 	if (dev_is_pci(dev)) {
3694 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3695 		    pci_ats_supported(pdev) &&
3696 		    dmar_ats_supported(pdev, iommu)) {
3697 			info->ats_supported = 1;
3698 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3699 
3700 			/*
3701 			 * For IOMMU that supports device IOTLB throttling
3702 			 * (DIT), we assign PFSID to the invalidation desc
3703 			 * of a VF such that IOMMU HW can gauge queue depth
3704 			 * at PF level. If DIT is not set, PFSID will be
3705 			 * treated as reserved, which should be set to 0.
3706 			 */
3707 			if (ecap_dit(iommu->ecap))
3708 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3709 			info->ats_qdep = pci_ats_queue_depth(pdev);
3710 		}
3711 		if (sm_supported(iommu)) {
3712 			if (pasid_supported(iommu)) {
3713 				int features = pci_pasid_features(pdev);
3714 
3715 				if (features >= 0)
3716 					info->pasid_supported = features | 1;
3717 			}
3718 
3719 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3720 			    pci_pri_supported(pdev))
3721 				info->pri_supported = 1;
3722 		}
3723 	}
3724 
3725 	dev_iommu_priv_set(dev, info);
3726 	if (pdev && pci_ats_supported(pdev)) {
3727 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3728 		ret = device_rbtree_insert(iommu, info);
3729 		if (ret)
3730 			goto free;
3731 	}
3732 
3733 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3734 		ret = intel_pasid_alloc_table(dev);
3735 		if (ret) {
3736 			dev_err(dev, "PASID table allocation failed\n");
3737 			goto clear_rbtree;
3738 		}
3739 
3740 		if (!context_copied(iommu, info->bus, info->devfn)) {
3741 			ret = intel_pasid_setup_sm_context(dev);
3742 			if (ret)
3743 				goto free_table;
3744 		}
3745 	}
3746 
3747 	intel_iommu_debugfs_create_dev(info);
3748 
3749 	/*
3750 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3751 	 * device is undefined if you enable PASID support after ATS support.
3752 	 * So always enable PASID support on devices which have it, even if
3753 	 * we can't yet know if we're ever going to use it.
3754 	 */
3755 	if (info->pasid_supported &&
3756 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3757 		info->pasid_enabled = 1;
3758 
3759 	return &iommu->iommu;
3760 free_table:
3761 	intel_pasid_free_table(dev);
3762 clear_rbtree:
3763 	device_rbtree_remove(info);
3764 free:
3765 	kfree(info);
3766 
3767 	return ERR_PTR(ret);
3768 }
3769 
3770 static void intel_iommu_release_device(struct device *dev)
3771 {
3772 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3773 	struct intel_iommu *iommu = info->iommu;
3774 
3775 	if (info->pasid_enabled) {
3776 		pci_disable_pasid(to_pci_dev(dev));
3777 		info->pasid_enabled = 0;
3778 	}
3779 
3780 	mutex_lock(&iommu->iopf_lock);
3781 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3782 		device_rbtree_remove(info);
3783 	mutex_unlock(&iommu->iopf_lock);
3784 
3785 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3786 	    !context_copied(iommu, info->bus, info->devfn))
3787 		intel_pasid_teardown_sm_context(dev);
3788 
3789 	intel_pasid_free_table(dev);
3790 	intel_iommu_debugfs_remove_dev(info);
3791 	kfree(info);
3792 	set_dma_ops(dev, NULL);
3793 }
3794 
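/*
 * Reserved regions reported for a device: RMRR ranges that target the
 * device (or a bridge above it), the low 16MiB ISA range on ISA bridges
 * when the floppy workaround is enabled, and the IOAPIC/MSI range.
 */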
3795 static void intel_iommu_get_resv_regions(struct device *device,
3796 					 struct list_head *head)
3797 {
3798 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3799 	struct iommu_resv_region *reg;
3800 	struct dmar_rmrr_unit *rmrr;
3801 	struct device *i_dev;
3802 	int i;
3803 
3804 	rcu_read_lock();
3805 	for_each_rmrr_units(rmrr) {
3806 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3807 					  i, i_dev) {
3808 			struct iommu_resv_region *resv;
3809 			enum iommu_resv_type type;
3810 			size_t length;
3811 
3812 			if (i_dev != device &&
3813 			    !is_downstream_to_pci_bridge(device, i_dev))
3814 				continue;
3815 
3816 			length = rmrr->end_address - rmrr->base_address + 1;
3817 
3818 			type = device_rmrr_is_relaxable(device) ?
3819 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3820 
3821 			resv = iommu_alloc_resv_region(rmrr->base_address,
3822 						       length, prot, type,
3823 						       GFP_ATOMIC);
3824 			if (!resv)
3825 				break;
3826 
3827 			list_add_tail(&resv->list, head);
3828 		}
3829 	}
3830 	rcu_read_unlock();
3831 
3832 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3833 	if (dev_is_pci(device)) {
3834 		struct pci_dev *pdev = to_pci_dev(device);
3835 
3836 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3837 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3838 					IOMMU_RESV_DIRECT_RELAXABLE,
3839 					GFP_KERNEL);
3840 			if (reg)
3841 				list_add_tail(&reg->list, head);
3842 		}
3843 	}
3844 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3845 
3846 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3847 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3848 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3849 	if (!reg)
3850 		return;
3851 	list_add_tail(&reg->list, head);
3852 }
3853 
3854 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3855 {
3856 	if (dev_is_pci(dev))
3857 		return pci_device_group(dev);
3858 	return generic_device_group(dev);
3859 }
3860 
3861 static int intel_iommu_enable_sva(struct device *dev)
3862 {
3863 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3864 	struct intel_iommu *iommu;
3865 
3866 	if (!info || dmar_disabled)
3867 		return -EINVAL;
3868 
3869 	iommu = info->iommu;
3870 	if (!iommu)
3871 		return -EINVAL;
3872 
3873 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3874 		return -ENODEV;
3875 
3876 	if (!info->pasid_enabled || !info->ats_enabled)
3877 		return -EINVAL;
3878 
3879 	/*
3880 	 * Devices having device-specific I/O fault handling should not
3881 	 * support PCI/PRI. The IOMMU side has no means to check the
3882 	 * capability of device-specific IOPF. Therefore, the IOMMU can only
3883 	 * assume that if the device driver enables SVA on a non-PRI
3884 	 * device, it will handle IOPF in its own way.
3885 	 */
3886 	if (!info->pri_supported)
3887 		return 0;
3888 
3889 	/* Devices supporting PRI should have it enabled. */
3890 	if (!info->pri_enabled)
3891 		return -EINVAL;
3892 
3893 	return 0;
3894 }
3895 
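/*
 * Toggle the Page Request Enable (PRE) bit in the device's scalable-mode
 * context entry so that the IOMMU starts or stops accepting page requests
 * from the device, then flush the affected context entry.
 */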
3896 static int context_flip_pri(struct device_domain_info *info, bool enable)
3897 {
3898 	struct intel_iommu *iommu = info->iommu;
3899 	u8 bus = info->bus, devfn = info->devfn;
3900 	struct context_entry *context;
3901 	u16 did;
3902 
3903 	spin_lock(&iommu->lock);
3904 	if (context_copied(iommu, bus, devfn)) {
3905 		spin_unlock(&iommu->lock);
3906 		return -EINVAL;
3907 	}
3908 
3909 	context = iommu_context_addr(iommu, bus, devfn, false);
3910 	if (!context || !context_present(context)) {
3911 		spin_unlock(&iommu->lock);
3912 		return -ENODEV;
3913 	}
3914 	did = context_domain_id(context);
3915 
3916 	if (enable)
3917 		context_set_sm_pre(context);
3918 	else
3919 		context_clear_sm_pre(context);
3920 
3921 	if (!ecap_coherent(iommu->ecap))
3922 		clflush_cache_range(context, sizeof(*context));
3923 	intel_context_flush_present(info, context, did, true);
3924 	spin_unlock(&iommu->lock);
3925 
3926 	return 0;
3927 }
3928 
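/*
 * I/O page faults require ATS and PRI support on the device. The device is
 * added to the IOMMU's fault queue before PRI is enabled in the context
 * entry and on the PCI function; any failure unwinds the earlier steps.
 */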
3929 static int intel_iommu_enable_iopf(struct device *dev)
3930 {
3931 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3932 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3933 	struct intel_iommu *iommu;
3934 	int ret;
3935 
3936 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3937 		return -ENODEV;
3938 
3939 	if (info->pri_enabled)
3940 		return -EBUSY;
3941 
3942 	iommu = info->iommu;
3943 	if (!iommu)
3944 		return -EINVAL;
3945 
3946 	/* PASID is required in PRG Response Message. */
3947 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3948 		return -EINVAL;
3949 
3950 	ret = pci_reset_pri(pdev);
3951 	if (ret)
3952 		return ret;
3953 
3954 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3955 	if (ret)
3956 		return ret;
3957 
3958 	ret = context_flip_pri(info, true);
3959 	if (ret)
3960 		goto err_remove_device;
3961 
3962 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
3963 	if (ret)
3964 		goto err_clear_pri;
3965 
3966 	info->pri_enabled = 1;
3967 
3968 	return 0;
3969 err_clear_pri:
3970 	context_flip_pri(info, false);
3971 err_remove_device:
3972 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3973 
3974 	return ret;
3975 }
3976 
3977 static int intel_iommu_disable_iopf(struct device *dev)
3978 {
3979 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3980 	struct intel_iommu *iommu = info->iommu;
3981 
3982 	if (!info->pri_enabled)
3983 		return -EINVAL;
3984 
3985 	/* Disable new PRI reception: */
3986 	context_flip_pri(info, false);
3987 
3988 	/*
3989 	 * Remove device from fault queue and acknowledge all outstanding
3990 	 * PRQs to the device:
3991 	 */
3992 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3993 
3994 	/*
3995 	 * The PCIe spec states that by clearing the PRI enable bit, the Page
3996 	 * Request Interface will not issue new page requests, but may still
3997 	 * have outstanding page requests that have been transmitted or are
3998 	 * queued for transmission. This is supposed to be called after
3999 	 * the device driver has stopped DMA, all PASIDs have been
4000 	 * unbound and the outstanding PRQs have been drained.
4001 	 */
4002 	pci_disable_pri(to_pci_dev(dev));
4003 	info->pri_enabled = 0;
4004 
4005 	return 0;
4006 }
4007 
4008 static int
4009 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4010 {
4011 	switch (feat) {
4012 	case IOMMU_DEV_FEAT_IOPF:
4013 		return intel_iommu_enable_iopf(dev);
4014 
4015 	case IOMMU_DEV_FEAT_SVA:
4016 		return intel_iommu_enable_sva(dev);
4017 
4018 	default:
4019 		return -ENODEV;
4020 	}
4021 }
4022 
4023 static int
4024 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4025 {
4026 	switch (feat) {
4027 	case IOMMU_DEV_FEAT_IOPF:
4028 		return intel_iommu_disable_iopf(dev);
4029 
4030 	case IOMMU_DEV_FEAT_SVA:
4031 		return 0;
4032 
4033 	default:
4034 		return -ENODEV;
4035 	}
4036 }
4037 
4038 static bool intel_iommu_is_attach_deferred(struct device *dev)
4039 {
4040 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4041 
4042 	return translation_pre_enabled(info->iommu) && !info->domain;
4043 }
4044 
4045 /*
4046  * Check that the device does not live on an external facing PCI port that is
4047  * marked as untrusted. Such devices should not be able to apply quirks and
4048  * thus not be able to bypass the IOMMU restrictions.
4049  */
4050 static bool risky_device(struct pci_dev *pdev)
4051 {
4052 	if (pdev->untrusted) {
4053 		pci_info(pdev,
4054 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4055 			 pdev->vendor, pdev->device);
4056 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4057 		return true;
4058 	}
4059 	return false;
4060 }
4061 
4062 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4063 				      unsigned long iova, size_t size)
4064 {
4065 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4066 
4067 	return 0;
4068 }
4069 
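/*
 * Detach @pasid of @dev from @domain: unlink the tracking structure from
 * the domain's list, drop the cache tag and the domain's reference on the
 * IOMMU. The identity domain keeps no per-PASID state and is skipped.
 */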
4070 void domain_remove_dev_pasid(struct iommu_domain *domain,
4071 			     struct device *dev, ioasid_t pasid)
4072 {
4073 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4074 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4075 	struct intel_iommu *iommu = info->iommu;
4076 	struct dmar_domain *dmar_domain;
4077 	unsigned long flags;
4078 
4079 	if (!domain)
4080 		return;
4081 
4082 	/* The identity domain has no per-PASID metadata to clean up. */
4083 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4084 		return;
4085 
4086 	dmar_domain = to_dmar_domain(domain);
4087 	spin_lock_irqsave(&dmar_domain->lock, flags);
4088 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4089 		if (curr->dev == dev && curr->pasid == pasid) {
4090 			list_del(&curr->link_domain);
4091 			dev_pasid = curr;
4092 			break;
4093 		}
4094 	}
4095 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4096 
4097 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4098 	domain_detach_iommu(dmar_domain, iommu);
4099 	if (!WARN_ON_ONCE(!dev_pasid)) {
4100 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4101 		kfree(dev_pasid);
4102 	}
4103 }
4104 
4105 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4106 					 struct device *dev, ioasid_t pasid,
4107 					 struct iommu_domain *old)
4108 {
4109 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4110 
4111 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4112 	domain_remove_dev_pasid(old, dev, pasid);
4113 
4114 	return 0;
4115 }
4116 
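/*
 * Allocate per-PASID tracking for @dev in @domain, take a reference on the
 * IOMMU and assign a cache tag before linking the entry onto the domain's
 * dev_pasids list; everything is unwound on failure.
 */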
4117 struct dev_pasid_info *
4118 domain_add_dev_pasid(struct iommu_domain *domain,
4119 		     struct device *dev, ioasid_t pasid)
4120 {
4121 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4122 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4123 	struct intel_iommu *iommu = info->iommu;
4124 	struct dev_pasid_info *dev_pasid;
4125 	unsigned long flags;
4126 	int ret;
4127 
4128 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4129 	if (!dev_pasid)
4130 		return ERR_PTR(-ENOMEM);
4131 
4132 	ret = domain_attach_iommu(dmar_domain, iommu);
4133 	if (ret)
4134 		goto out_free;
4135 
4136 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4137 	if (ret)
4138 		goto out_detach_iommu;
4139 
4140 	dev_pasid->dev = dev;
4141 	dev_pasid->pasid = pasid;
4142 	spin_lock_irqsave(&dmar_domain->lock, flags);
4143 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4144 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4145 
4146 	return dev_pasid;
4147 out_detach_iommu:
4148 	domain_detach_iommu(dmar_domain, iommu);
4149 out_free:
4150 	kfree(dev_pasid);
4151 	return ERR_PTR(ret);
4152 }
4153 
4154 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4155 				     struct device *dev, ioasid_t pasid,
4156 				     struct iommu_domain *old)
4157 {
4158 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4159 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4160 	struct intel_iommu *iommu = info->iommu;
4161 	struct dev_pasid_info *dev_pasid;
4162 	int ret;
4163 
4164 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4165 		return -EINVAL;
4166 
4167 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4168 		return -EOPNOTSUPP;
4169 
4170 	if (domain->dirty_ops)
4171 		return -EINVAL;
4172 
4173 	if (context_copied(iommu, info->bus, info->devfn))
4174 		return -EBUSY;
4175 
4176 	ret = paging_domain_compatible(domain, dev);
4177 	if (ret)
4178 		return ret;
4179 
4180 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4181 	if (IS_ERR(dev_pasid))
4182 		return PTR_ERR(dev_pasid);
4183 
4184 	if (dmar_domain->use_first_level)
4185 		ret = domain_setup_first_level(iommu, dmar_domain,
4186 					       dev, pasid, old);
4187 	else
4188 		ret = domain_setup_second_level(iommu, dmar_domain,
4189 						dev, pasid, old);
4190 	if (ret)
4191 		goto out_remove_dev_pasid;
4192 
4193 	domain_remove_dev_pasid(old, dev, pasid);
4194 
4195 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4196 
4197 	return 0;
4198 
4199 out_remove_dev_pasid:
4200 	domain_remove_dev_pasid(domain, dev, pasid);
4201 	return ret;
4202 }
4203 
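/*
 * Report VT-d hardware information (the capability and extended capability
 * registers) to the iommufd layer.
 */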
4204 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4205 {
4206 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4207 	struct intel_iommu *iommu = info->iommu;
4208 	struct iommu_hw_info_vtd *vtd;
4209 
4210 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4211 	if (!vtd)
4212 		return ERR_PTR(-ENOMEM);
4213 
4214 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4215 	vtd->cap_reg = iommu->cap;
4216 	vtd->ecap_reg = iommu->ecap;
4217 	*length = sizeof(*vtd);
4218 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4219 	return vtd;
4220 }
4221 
4222 /*
4223  * Set dirty tracking for the device list of a domain. The caller must
4224  * hold the domain->lock when calling it.
4225  */
4226 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4227 {
4228 	struct device_domain_info *info;
4229 	int ret = 0;
4230 
4231 	list_for_each_entry(info, devices, link) {
4232 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4233 						       IOMMU_NO_PASID, enable);
4234 		if (ret)
4235 			break;
4236 	}
4237 
4238 	return ret;
4239 }
4240 
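/*
 * Propagate the dirty tracking state to the devices of every first-stage
 * domain nested on this parent; on failure, roll them all back to the
 * parent's current setting.
 */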
4241 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4242 					    bool enable)
4243 {
4244 	struct dmar_domain *s1_domain;
4245 	unsigned long flags;
4246 	int ret;
4247 
4248 	spin_lock(&domain->s1_lock);
4249 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4250 		spin_lock_irqsave(&s1_domain->lock, flags);
4251 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4252 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4253 		if (ret)
4254 			goto err_unwind;
4255 	}
4256 	spin_unlock(&domain->s1_lock);
4257 	return 0;
4258 
4259 err_unwind:
4260 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4261 		spin_lock_irqsave(&s1_domain->lock, flags);
4262 		device_set_dirty_tracking(&s1_domain->devices,
4263 					  domain->dirty_tracking);
4264 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4265 	}
4266 	spin_unlock(&domain->s1_lock);
4267 	return ret;
4268 }
4269 
4270 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4271 					  bool enable)
4272 {
4273 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4274 	int ret;
4275 
4276 	spin_lock(&dmar_domain->lock);
4277 	if (dmar_domain->dirty_tracking == enable)
4278 		goto out_unlock;
4279 
4280 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4281 	if (ret)
4282 		goto err_unwind;
4283 
4284 	if (dmar_domain->nested_parent) {
4285 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4286 		if (ret)
4287 			goto err_unwind;
4288 	}
4289 
4290 	dmar_domain->dirty_tracking = enable;
4291 out_unlock:
4292 	spin_unlock(&dmar_domain->lock);
4293 
4294 	return 0;
4295 
4296 err_unwind:
4297 	device_set_dirty_tracking(&dmar_domain->devices,
4298 				  dmar_domain->dirty_tracking);
4299 	spin_unlock(&dmar_domain->lock);
4300 	return ret;
4301 }
4302 
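/*
 * Walk the second-level page table over [iova, iova + size) in leaf-page
 * sized steps, test and clear the dirty bit of each present PTE, and
 * record dirtied ranges in @dirty.
 */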
4303 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4304 					    unsigned long iova, size_t size,
4305 					    unsigned long flags,
4306 					    struct iommu_dirty_bitmap *dirty)
4307 {
4308 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4309 	unsigned long end = iova + size - 1;
4310 	unsigned long pgsize;
4311 
4312 	/*
4313 	 * The IOMMUFD core calls into a dirty-tracking-disabled domain without an
4314 	 * IOVA bitmap set in order to clear dirty bits in all PTEs that might
4315 	 * have been set before dirty tracking was stopped. This ensures that we
4316 	 * never inherit dirtied bits from a previous cycle.
4317 	 */
4318 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4319 		return -EINVAL;
4320 
4321 	do {
4322 		struct dma_pte *pte;
4323 		int lvl = 0;
4324 
4325 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4326 				     GFP_ATOMIC);
4327 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4328 		if (!pte || !dma_pte_present(pte)) {
4329 			iova += pgsize;
4330 			continue;
4331 		}
4332 
4333 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4334 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4335 		iova += pgsize;
4336 	} while (iova < end);
4337 
4338 	return 0;
4339 }
4340 
4341 static const struct iommu_dirty_ops intel_dirty_ops = {
4342 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4343 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4344 };
4345 
4346 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4347 {
4348 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4349 	struct intel_iommu *iommu = info->iommu;
4350 	struct context_entry *context;
4351 
4352 	spin_lock(&iommu->lock);
4353 	context = iommu_context_addr(iommu, bus, devfn, 1);
4354 	if (!context) {
4355 		spin_unlock(&iommu->lock);
4356 		return -ENOMEM;
4357 	}
4358 
4359 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4360 		spin_unlock(&iommu->lock);
4361 		return 0;
4362 	}
4363 
4364 	copied_context_tear_down(iommu, context, bus, devfn);
4365 	context_clear_entry(context);
4366 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4367 
4368 	/*
4369 	 * In pass-through mode, AW must be programmed to indicate the largest
4370 	 * AGAW value supported by hardware, and ASR is ignored by hardware.
4371 	 */
4372 	context_set_address_width(context, iommu->msagaw);
4373 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4374 	context_set_fault_enable(context);
4375 	context_set_present(context);
4376 	if (!ecap_coherent(iommu->ecap))
4377 		clflush_cache_range(context, sizeof(*context));
4378 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4379 	spin_unlock(&iommu->lock);
4380 
4381 	return 0;
4382 }
4383 
4384 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4385 {
4386 	struct device *dev = data;
4387 
4388 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4389 }
4390 
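/*
 * Program pass-through context entries for the device. For PCI devices
 * this walks all DMA aliases so that every requester ID the device may
 * present is covered.
 */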
4391 static int device_setup_pass_through(struct device *dev)
4392 {
4393 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4394 
4395 	if (!dev_is_pci(dev))
4396 		return context_setup_pass_through(dev, info->bus, info->devfn);
4397 
4398 	return pci_for_each_dma_alias(to_pci_dev(dev),
4399 				      context_setup_pass_through_cb, dev);
4400 }
4401 
4402 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4403 {
4404 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4405 	struct intel_iommu *iommu = info->iommu;
4406 	int ret;
4407 
4408 	device_block_translation(dev);
4409 
4410 	if (dev_is_real_dma_subdevice(dev))
4411 		return 0;
4412 
4413 	if (sm_supported(iommu)) {
4414 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4415 		if (!ret)
4416 			iommu_enable_pci_caps(info);
4417 	} else {
4418 		ret = device_setup_pass_through(dev);
4419 	}
4420 
4421 	return ret;
4422 }
4423 
4424 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4425 					 struct device *dev, ioasid_t pasid,
4426 					 struct iommu_domain *old)
4427 {
4428 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4429 	struct intel_iommu *iommu = info->iommu;
4430 	int ret;
4431 
4432 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4433 		return -EOPNOTSUPP;
4434 
4435 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4436 	if (ret)
4437 		return ret;
4438 
4439 	domain_remove_dev_pasid(old, dev, pasid);
4440 	return 0;
4441 }
4442 
4443 static struct iommu_domain identity_domain = {
4444 	.type = IOMMU_DOMAIN_IDENTITY,
4445 	.ops = &(const struct iommu_domain_ops) {
4446 		.attach_dev	= identity_domain_attach_dev,
4447 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4448 	},
4449 };
4450 
4451 const struct iommu_ops intel_iommu_ops = {
4452 	.blocked_domain		= &blocking_domain,
4453 	.release_domain		= &blocking_domain,
4454 	.identity_domain	= &identity_domain,
4455 	.capable		= intel_iommu_capable,
4456 	.hw_info		= intel_iommu_hw_info,
4457 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4458 	.domain_alloc_sva	= intel_svm_domain_alloc,
4459 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4460 	.probe_device		= intel_iommu_probe_device,
4461 	.release_device		= intel_iommu_release_device,
4462 	.get_resv_regions	= intel_iommu_get_resv_regions,
4463 	.device_group		= intel_iommu_device_group,
4464 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4465 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4466 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4467 	.def_domain_type	= device_def_domain_type,
4468 	.pgsize_bitmap		= SZ_4K,
4469 	.page_response		= intel_iommu_page_response,
4470 	.default_domain_ops = &(const struct iommu_domain_ops) {
4471 		.attach_dev		= intel_iommu_attach_device,
4472 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4473 		.map_pages		= intel_iommu_map_pages,
4474 		.unmap_pages		= intel_iommu_unmap_pages,
4475 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4476 		.flush_iotlb_all        = intel_flush_iotlb_all,
4477 		.iotlb_sync		= intel_iommu_tlb_sync,
4478 		.iova_to_phys		= intel_iommu_iova_to_phys,
4479 		.free			= intel_iommu_domain_free,
4480 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4481 	}
4482 };
4483 
4484 static void quirk_iommu_igfx(struct pci_dev *dev)
4485 {
4486 	if (risky_device(dev))
4487 		return;
4488 
4489 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4490 	disable_igfx_iommu = 1;
4491 }
4492 
4493 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4497 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4498 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4499 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4500 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4501 
4502 /* Broadwell igfx malfunctions with dmar */
4503 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4505 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4506 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4507 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4508 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4527 
4528 static void quirk_iommu_rwbf(struct pci_dev *dev)
4529 {
4530 	if (risky_device(dev))
4531 		return;
4532 
4533 	/*
4534 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4535 	 * but needs it. Same seems to hold for the desktop versions.
4536 	 */
4537 	pci_info(dev, "Forcing write-buffer flush capability\n");
4538 	rwbf_quirk = 1;
4539 }
4540 
4541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4545 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4548 
4549 #define GGC 0x52
4550 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4551 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4552 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4553 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4554 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4555 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4556 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4557 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4558 
4559 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4560 {
4561 	unsigned short ggc;
4562 
4563 	if (risky_device(dev))
4564 		return;
4565 
4566 	if (pci_read_config_word(dev, GGC, &ggc))
4567 		return;
4568 
4569 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4570 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4571 		disable_igfx_iommu = 1;
4572 	} else if (!disable_igfx_iommu) {
4573 		/* we have to ensure the gfx device is idle before we flush */
4574 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4575 		iommu_set_dma_strict();
4576 	}
4577 }
4578 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4582 
4583 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4584 {
4585 	unsigned short ver;
4586 
4587 	if (!IS_GFX_DEVICE(dev))
4588 		return;
4589 
4590 	ver = (dev->device >> 8) & 0xff;
4591 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4592 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4593 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4594 		return;
4595 
4596 	if (risky_device(dev))
4597 		return;
4598 
4599 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4600 	iommu_skip_te_disable = 1;
4601 }
4602 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4603 
4604 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4605    ISOCH DMAR unit for the Azalia sound device, but not give it any
4606    TLB entries, which causes it to deadlock. Check for that.  We do
4607    this in a function called from init_dmars(), instead of in a PCI
4608    quirk, because we don't want to print the obnoxious "BIOS broken"
4609    message if VT-d is actually disabled.
4610 */
4611 static void __init check_tylersburg_isoch(void)
4612 {
4613 	struct pci_dev *pdev;
4614 	uint32_t vtisochctrl;
4615 
4616 	/* If there's no Azalia in the system anyway, forget it. */
4617 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4618 	if (!pdev)
4619 		return;
4620 
4621 	if (risky_device(pdev)) {
4622 		pci_dev_put(pdev);
4623 		return;
4624 	}
4625 
4626 	pci_dev_put(pdev);
4627 
4628 	/* System Management Registers. Might be hidden, in which case
4629 	   we can't do the sanity check. But that's OK, because the
4630 	   known-broken BIOSes _don't_ actually hide it, so far. */
4631 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4632 	if (!pdev)
4633 		return;
4634 
4635 	if (risky_device(pdev)) {
4636 		pci_dev_put(pdev);
4637 		return;
4638 	}
4639 
4640 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4641 		pci_dev_put(pdev);
4642 		return;
4643 	}
4644 
4645 	pci_dev_put(pdev);
4646 
4647 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4648 	if (vtisochctrl & 1)
4649 		return;
4650 
4651 	/* Drop all bits other than the number of TLB entries */
4652 	vtisochctrl &= 0x1c;
4653 
4654 	/* If we have the recommended number of TLB entries (16), fine. */
4655 	if (vtisochctrl == 0x10)
4656 		return;
4657 
4658 	/* Zero TLB entries? You get to ride the short bus to school. */
4659 	if (!vtisochctrl) {
4660 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4661 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4662 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4663 		     dmi_get_system_info(DMI_BIOS_VERSION),
4664 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4665 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4666 		return;
4667 	}
4668 
4669 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4670 	       vtisochctrl);
4671 }
4672 
4673 /*
4674  * Here we deal with a device TLB defect where a device may inadvertently issue an
4675  * ATS invalidation completion before posted writes initiated with a translated
4676  * address that used translations matching the invalidation address range, violating
4677  * the invalidation completion ordering.
4678  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4679  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4680  * under the control of the trusted/privileged host device driver must use this
4681  * quirk.
4682  * Device TLBs are invalidated under the following six conditions:
4683  * 1. Device driver does DMA API unmap IOVA
4684  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4685  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4686  *    exit_mmap() due to crash
4687  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4688  *    VM has to free pages that were unmapped
4689  * 5. Userspace driver unmaps a DMA buffer
4690  * 6. Cache invalidation in vSVA usage (upcoming)
4691  *
4692  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4693  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4694  * invalidate TLB the same way as normal user unmap which will use this quirk.
4695  * The dTLB invalidation after PASID cache flush does not need this quirk.
4696  *
4697  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4698  */
4699 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4700 			       unsigned long address, unsigned long mask,
4701 			       u32 pasid, u16 qdep)
4702 {
4703 	u16 sid;
4704 
4705 	if (likely(!info->dtlb_extra_inval))
4706 		return;
4707 
4708 	sid = PCI_DEVID(info->bus, info->devfn);
4709 	if (pasid == IOMMU_NO_PASID) {
4710 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4711 				   qdep, address, mask);
4712 	} else {
4713 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4714 					 pasid, qdep, address, mask);
4715 	}
4716 }
4717 
4718 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4719 
4720 /*
4721  * Function to submit a command to the enhanced command interface. The
4722  * valid enhanced command descriptions are defined in Table 47 of the
4723  * VT-d spec. The VT-d hardware implementation may support some but not
4724  * all commands, which can be determined by checking the Enhanced
4725  * Command Capability Register.
4726  *
4727  * Return values:
4728  *  - 0: Command successful without any error;
4729  *  - Negative: software error value;
4730  *  - Nonzero positive: failure status code defined in Table 48.
4731  */
4732 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4733 {
4734 	unsigned long flags;
4735 	u64 res;
4736 	int ret;
4737 
4738 	if (!cap_ecmds(iommu->cap))
4739 		return -ENODEV;
4740 
4741 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4742 
4743 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4744 	if (res & DMA_ECMD_ECRSP_IP) {
4745 		ret = -EBUSY;
4746 		goto err;
4747 	}
4748 
4749 	/*
4750 	 * Unconditionally write the operand B, because
4751 	 *   operand B but we still set the register to some value.
4752 	 *   operand B, but we set the register to some value.
4753 	 * - It's not invoked in any critical path. The extra MMIO
4754 	 *   write doesn't bring any performance concerns.
4755 	 */
4756 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4757 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4758 
4759 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4760 		      !(res & DMA_ECMD_ECRSP_IP), res);
4761 
4762 	if (res & DMA_ECMD_ECRSP_IP) {
4763 		ret = -ETIMEDOUT;
4764 		goto err;
4765 	}
4766 
4767 	ret = ecmd_get_status_code(res);
4768 err:
4769 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4770 
4771 	return ret;
4772 }
4773