xref: /linux/drivers/iommu/intel/iommu.c (revision 8477ab143069c6b05d6da4a8184ded8b969240f5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
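
/*
 * A rough worked example of the two macros above, assuming VTD_PAGE_SHIFT is
 * 12: for gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xfffffffff
 * and DOMAIN_MAX_ADDR(48) == 0xfffffffff000. On a 32-bit kernel,
 * DOMAIN_MAX_PFN() clamps the PFN to (unsigned long)-1 == 0xffffffff so that
 * PFN arithmetic never overflows an unsigned long.
 */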
56 
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59 
60 /*
61  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
62  * (used when the kernel is launched with TXT).
63  */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67 
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69 
70 /*
71  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72  * if marked present.
73  */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 	if (!(re->lo & 1))
77 		return 0;
78 
79 	return re->lo & VTD_PAGE_MASK;
80 }
81 
82 /*
83  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84  * if marked present.
85  */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 	if (!(re->hi & 1))
89 		return 0;
90 
91 	return re->hi & VTD_PAGE_MASK;
92 }
93 
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 	struct device_domain_info *info =
97 		rb_entry(node, struct device_domain_info, node);
98 	const u16 *rid_lhs = key;
99 
100 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 		return -1;
102 
103 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 		return 1;
105 
106 	return 0;
107 }
108 
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 	struct device_domain_info *info =
112 		rb_entry(lhs, struct device_domain_info, node);
113 	u16 key = PCI_DEVID(info->bus, info->devfn);
114 
115 	return device_rid_cmp_key(&key, rhs);
116 }
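
/*
 * The device rbtree is keyed by the 16-bit requester ID, i.e.
 * PCI_DEVID(bus, devfn) with the bus number in the upper byte and devfn in
 * the lower byte. For example, device 0000:3a:00.1 maps to the key 0x3a01,
 * so device_rid_cmp_key() orders entries first by bus and then by devfn.
 */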
117 
118 /*
119  * Looks up an IOMMU-probed device using its source ID.
120  *
121  * Returns the pointer to the device if there is a match. Otherwise,
122  * returns NULL.
123  *
124  * Note that this helper doesn't guarantee that the device won't be
125  * released by the iommu subsystem after being returned. The caller
126  * should use its own synchronization mechanism to prevent the device
127  * from being released while it is still in use, if that is a possibility.
128  */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 	struct device_domain_info *info = NULL;
132 	struct rb_node *node;
133 	unsigned long flags;
134 
135 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 	if (node)
138 		info = rb_entry(node, struct device_domain_info, node);
139 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140 
141 	return info ? info->dev : NULL;
142 }
143 
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 				struct device_domain_info *info)
146 {
147 	struct rb_node *curr;
148 	unsigned long flags;
149 
150 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 	if (WARN_ON(curr))
154 		return -EEXIST;
155 
156 	return 0;
157 }
158 
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 	struct intel_iommu *iommu = info->iommu;
162 	unsigned long flags;
163 
164 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 	rb_erase(&info->node, &iommu->device_rbtree);
166 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168 
169 struct dmar_rmrr_unit {
170 	struct list_head list;		/* list of rmrr units	*/
171 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
172 	u64	base_address;		/* reserved base address*/
173 	u64	end_address;		/* reserved end address */
174 	struct dmar_dev_scope *devices;	/* target devices */
175 	int	devices_cnt;		/* target device count */
176 };
177 
178 struct dmar_atsr_unit {
179 	struct list_head list;		/* list of ATSR units */
180 	struct acpi_dmar_header *hdr;	/* ACPI header */
181 	struct dmar_dev_scope *devices;	/* target devices */
182 	int devices_cnt;		/* target device count */
183 	u8 include_all:1;		/* include all ports */
184 };
185 
186 struct dmar_satc_unit {
187 	struct list_head list;		/* list of SATC units */
188 	struct acpi_dmar_header *hdr;	/* ACPI header */
189 	struct dmar_dev_scope *devices;	/* target devices */
190 	struct intel_iommu *iommu;	/* the corresponding iommu */
191 	int devices_cnt;		/* target device count */
192 	u8 atc_required:1;		/* ATS is required */
193 };
194 
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198 
199 #define for_each_rmrr_units(rmrr) \
200 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201 
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203 
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206 
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209 
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214 
215 #define IDENTMAP_AZALIA		4
216 
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219 
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224 
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229 
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 	u32 gsts;
233 
234 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 	if (gsts & DMA_GSTS_TES)
236 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238 
239 static int __init intel_iommu_setup(char *str)
240 {
241 	if (!str)
242 		return -EINVAL;
243 
244 	while (*str) {
245 		if (!strncmp(str, "on", 2)) {
246 			dmar_disabled = 0;
247 			pr_info("IOMMU enabled\n");
248 		} else if (!strncmp(str, "off", 3)) {
249 			dmar_disabled = 1;
250 			no_platform_optin = 1;
251 			pr_info("IOMMU disabled\n");
252 		} else if (!strncmp(str, "igfx_off", 8)) {
253 			disable_igfx_iommu = 1;
254 			pr_info("Disable GFX device mapping\n");
255 		} else if (!strncmp(str, "forcedac", 8)) {
256 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 			iommu_dma_forcedac = true;
258 		} else if (!strncmp(str, "strict", 6)) {
259 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 			iommu_set_dma_strict();
261 		} else if (!strncmp(str, "sp_off", 6)) {
262 			pr_info("Disable supported super page\n");
263 			intel_iommu_superpage = 0;
264 		} else if (!strncmp(str, "sm_on", 5)) {
265 			pr_info("Enable scalable mode if hardware supports\n");
266 			intel_iommu_sm = 1;
267 		} else if (!strncmp(str, "sm_off", 6)) {
268 			pr_info("Scalable mode is disallowed\n");
269 			intel_iommu_sm = 0;
270 		} else if (!strncmp(str, "tboot_noforce", 13)) {
271 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 			intel_iommu_tboot_noforce = 1;
273 		} else {
274 			pr_notice("Unknown option - '%s'\n", str);
275 		}
276 
277 		str += strcspn(str, ",");
278 		while (*str == ',')
279 			str++;
280 	}
281 
282 	return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
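
/*
 * The options above are comma separated on the kernel command line. For
 * example, booting with "intel_iommu=on,sm_on,sp_off" enables the IOMMU,
 * requests scalable mode (if the hardware supports it) and disables
 * superpage use, while any unknown token is reported and skipped.
 */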
285 
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289 
290 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292 
293 /*
294  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296  * the returned SAGAW.
297  */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 	unsigned long fl_sagaw, sl_sagaw;
301 
302 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 	sl_sagaw = cap_sagaw(iommu->cap);
304 
305 	/* Second level only. */
306 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 		return sl_sagaw;
308 
309 	/* First level only. */
310 	if (!ecap_slts(iommu->ecap))
311 		return fl_sagaw;
312 
313 	return fl_sagaw & sl_sagaw;
314 }
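
/*
 * For example, a scalable-mode IOMMU that supports both translation types,
 * reports SAGAW == 0x4 (4-level, 48-bit) and lacks 5-level first-level
 * paging gives fl_sagaw == BIT(2), so the function returns BIT(2): only the
 * 48-bit width is usable for both first- and second-level tables.
 */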
315 
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 	unsigned long sagaw;
319 	int agaw;
320 
321 	sagaw = __iommu_calculate_sagaw(iommu);
322 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 		if (test_bit(agaw, &sagaw))
324 			break;
325 	}
326 
327 	return agaw;
328 }
329 
330 /*
331  * Calculate max SAGAW for each iommu.
332  */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337 
338 /*
339  * Calculate the agaw for each iommu.
340  * "SAGAW" may differ across iommus, so use a default agaw and fall back
341  * to a smaller supported agaw for iommus that don't support the default.
342  */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
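
/*
 * With DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the search starts at agaw 3
 * (57-bit, 5-level). An IOMMU whose effective SAGAW is only BIT(2) thus
 * falls back to agaw 2, i.e. a 48-bit, 4-level page table.
 */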
347 
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 	return sm_supported(iommu) ?
351 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353 
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 	unsigned long bitmap = 0;
358 
359 	/*
360 	 * 1-level super page supports page size of 2MiB, 2-level super page
361 	 * supports page size of both 2MiB and 1GiB.
362 	 */
363 	if (domain->iommu_superpage == 1)
364 		bitmap |= SZ_2M;
365 	else if (domain->iommu_superpage == 2)
366 		bitmap |= SZ_2M | SZ_1G;
367 
368 	return bitmap;
369 }
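
/*
 * E.g. with iommu_superpage == 2 the returned bitmap is SZ_2M | SZ_1G ==
 * 0x40200000; the paging-domain setup code is expected to OR this with the
 * base 4KiB page size when filling in the domain's pgsize_bitmap.
 */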
370 
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 					 u8 devfn, int alloc)
373 {
374 	struct root_entry *root = &iommu->root_entry[bus];
375 	struct context_entry *context;
376 	u64 *entry;
377 
378 	/*
379 	 * Unless the caller requested to allocate a new entry,
380 	 * returning a copied context entry makes no sense.
381 	 */
382 	if (!alloc && context_copied(iommu, bus, devfn))
383 		return NULL;
384 
385 	entry = &root->lo;
386 	if (sm_supported(iommu)) {
387 		if (devfn >= 0x80) {
388 			devfn -= 0x80;
389 			entry = &root->hi;
390 		}
391 		devfn *= 2;
392 	}
393 	if (*entry & 1)
394 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 	else {
396 		unsigned long phy_addr;
397 		if (!alloc)
398 			return NULL;
399 
400 		context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
401 						    SZ_4K);
402 		if (!context)
403 			return NULL;
404 
405 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
406 		phy_addr = virt_to_phys((void *)context);
407 		*entry = phy_addr | 1;
408 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
409 	}
410 	return &context[devfn];
411 }
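
/*
 * In scalable mode a context entry is 32 bytes instead of 16, so a 4KiB
 * context table holds only 128 entries. The root entry is therefore split:
 * root->lo points to the table for devfn 0x00-0x7f and root->hi to the one
 * for devfn 0x80-0xff, and the 'devfn *= 2' above converts the function
 * number into an index of 16-byte struct context_entry slots. For example,
 * devfn 0x82 resolves to entry 4 of the upper context table.
 */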
412 
413 /**
414  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
415  *				 sub-hierarchy of a candidate PCI-PCI bridge
416  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
417  * @bridge: the candidate PCI-PCI bridge
418  *
419  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
420  */
421 static bool
422 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
423 {
424 	struct pci_dev *pdev, *pbridge;
425 
426 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
427 		return false;
428 
429 	pdev = to_pci_dev(dev);
430 	pbridge = to_pci_dev(bridge);
431 
432 	if (pbridge->subordinate &&
433 	    pbridge->subordinate->number <= pdev->bus->number &&
434 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
435 		return true;
436 
437 	return false;
438 }
439 
440 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
441 {
442 	struct dmar_drhd_unit *drhd;
443 	u32 vtbar;
444 	int rc;
445 
446 	/* We know that this device on this chipset has its own IOMMU.
447 	 * If we find it under a different IOMMU, then the BIOS is lying
448 	 * to us. Hope that the IOMMU for this device is actually
449 	 * disabled, and it needs no translation...
450 	 */
451 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
452 	if (rc) {
453 		/* "can't" happen */
454 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
455 		return false;
456 	}
457 	vtbar &= 0xffff0000;
458 
459 	/* we know that this iommu should be at offset 0xa000 from vtbar */
460 	drhd = dmar_find_matched_drhd_unit(pdev);
461 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
462 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
463 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
464 		return true;
465 	}
466 
467 	return false;
468 }
469 
470 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
471 {
472 	if (!iommu || iommu->drhd->ignored)
473 		return true;
474 
475 	if (dev_is_pci(dev)) {
476 		struct pci_dev *pdev = to_pci_dev(dev);
477 
478 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
479 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
480 		    quirk_ioat_snb_local_iommu(pdev))
481 			return true;
482 	}
483 
484 	return false;
485 }
486 
487 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
488 {
489 	struct dmar_drhd_unit *drhd = NULL;
490 	struct pci_dev *pdev = NULL;
491 	struct intel_iommu *iommu;
492 	struct device *tmp;
493 	u16 segment = 0;
494 	int i;
495 
496 	if (!dev)
497 		return NULL;
498 
499 	if (dev_is_pci(dev)) {
500 		struct pci_dev *pf_pdev;
501 
502 		pdev = pci_real_dma_dev(to_pci_dev(dev));
503 
504 		/* VFs aren't listed in scope tables; we need to look up
505 		 * the PF instead to find the IOMMU. */
506 		pf_pdev = pci_physfn(pdev);
507 		dev = &pf_pdev->dev;
508 		segment = pci_domain_nr(pdev->bus);
509 	} else if (has_acpi_companion(dev))
510 		dev = &ACPI_COMPANION(dev)->dev;
511 
512 	rcu_read_lock();
513 	for_each_iommu(iommu, drhd) {
514 		if (pdev && segment != drhd->segment)
515 			continue;
516 
517 		for_each_active_dev_scope(drhd->devices,
518 					  drhd->devices_cnt, i, tmp) {
519 			if (tmp == dev) {
520 				/* For a VF use its original BDF# not that of the PF
521 				 * which we used for the IOMMU lookup. Strictly speaking
522 				 * we could do this for all PCI devices; we only need to
523 				 * get the BDF# from the scope table for ACPI matches. */
524 				if (pdev && pdev->is_virtfn)
525 					goto got_pdev;
526 
527 				if (bus && devfn) {
528 					*bus = drhd->devices[i].bus;
529 					*devfn = drhd->devices[i].devfn;
530 				}
531 				goto out;
532 			}
533 
534 			if (is_downstream_to_pci_bridge(dev, tmp))
535 				goto got_pdev;
536 		}
537 
538 		if (pdev && drhd->include_all) {
539 got_pdev:
540 			if (bus && devfn) {
541 				*bus = pdev->bus->number;
542 				*devfn = pdev->devfn;
543 			}
544 			goto out;
545 		}
546 	}
547 	iommu = NULL;
548 out:
549 	if (iommu_is_dummy(iommu, dev))
550 		iommu = NULL;
551 
552 	rcu_read_unlock();
553 
554 	return iommu;
555 }
556 
557 static void domain_flush_cache(struct dmar_domain *domain,
558 			       void *addr, int size)
559 {
560 	if (!domain->iommu_coherency)
561 		clflush_cache_range(addr, size);
562 }
563 
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566 	struct context_entry *context;
567 	int i;
568 
569 	if (!iommu->root_entry)
570 		return;
571 
572 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
573 		context = iommu_context_addr(iommu, i, 0, 0);
574 		if (context)
575 			iommu_free_pages(context);
576 
577 		if (!sm_supported(iommu))
578 			continue;
579 
580 		context = iommu_context_addr(iommu, i, 0x80, 0);
581 		if (context)
582 			iommu_free_pages(context);
583 	}
584 
585 	iommu_free_pages(iommu->root_entry);
586 	iommu->root_entry = NULL;
587 }
588 
589 #ifdef CONFIG_DMAR_DEBUG
590 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
591 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
592 {
593 	struct dma_pte *pte;
594 	int offset;
595 
596 	while (1) {
597 		offset = pfn_level_offset(pfn, level);
598 		pte = &parent[offset];
599 
600 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
601 
602 		if (!dma_pte_present(pte)) {
603 			pr_info("page table not present at level %d\n", level - 1);
604 			break;
605 		}
606 
607 		if (level == 1 || dma_pte_superpage(pte))
608 			break;
609 
610 		parent = phys_to_virt(dma_pte_addr(pte));
611 		level--;
612 	}
613 }
614 
615 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
616 			  unsigned long long addr, u32 pasid)
617 {
618 	struct pasid_dir_entry *dir, *pde;
619 	struct pasid_entry *entries, *pte;
620 	struct context_entry *ctx_entry;
621 	struct root_entry *rt_entry;
622 	int i, dir_index, index, level;
623 	u8 devfn = source_id & 0xff;
624 	u8 bus = source_id >> 8;
625 	struct dma_pte *pgtable;
626 
627 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
628 
629 	/* root entry dump */
630 	if (!iommu->root_entry) {
631 		pr_info("root table is not present\n");
632 		return;
633 	}
634 	rt_entry = &iommu->root_entry[bus];
635 
636 	if (sm_supported(iommu))
637 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
638 			rt_entry->hi, rt_entry->lo);
639 	else
640 		pr_info("root entry: 0x%016llx", rt_entry->lo);
641 
642 	/* context entry dump */
643 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
644 	if (!ctx_entry) {
645 		pr_info("context table is not present\n");
646 		return;
647 	}
648 
649 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
650 		ctx_entry->hi, ctx_entry->lo);
651 
652 	/* legacy mode does not require PASID entries */
653 	if (!sm_supported(iommu)) {
654 		if (!context_present(ctx_entry)) {
655 			pr_info("legacy mode page table is not present\n");
656 			return;
657 		}
658 		level = agaw_to_level(ctx_entry->hi & 7);
659 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
660 		goto pgtable_walk;
661 	}
662 
663 	if (!context_present(ctx_entry)) {
664 		pr_info("pasid directory table is not present\n");
665 		return;
666 	}
667 
668 	/* get the pointer to pasid directory entry */
669 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
670 
671 	/* For request-without-pasid, get the pasid from context entry */
672 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
673 		pasid = IOMMU_NO_PASID;
674 
675 	dir_index = pasid >> PASID_PDE_SHIFT;
676 	pde = &dir[dir_index];
677 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
678 
679 	/* get the pointer to the pasid table entry */
680 	entries = get_pasid_table_from_pde(pde);
681 	if (!entries) {
682 		pr_info("pasid table is not present\n");
683 		return;
684 	}
685 	index = pasid & PASID_PTE_MASK;
686 	pte = &entries[index];
687 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
688 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
689 
690 	if (!pasid_pte_is_present(pte)) {
691 		pr_info("scalable mode page table is not present\n");
692 		return;
693 	}
694 
695 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
696 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
697 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
698 	} else {
699 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
700 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
701 	}
702 
703 pgtable_walk:
704 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
705 }
706 #endif
707 
708 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
709 				      unsigned long pfn, int *target_level,
710 				      gfp_t gfp)
711 {
712 	struct dma_pte *parent, *pte;
713 	int level = agaw_to_level(domain->agaw);
714 	int offset;
715 
716 	if (!domain_pfn_supported(domain, pfn))
717 		/* Address beyond IOMMU's addressing capabilities. */
718 		return NULL;
719 
720 	parent = domain->pgd;
721 
722 	while (1) {
723 		void *tmp_page;
724 
725 		offset = pfn_level_offset(pfn, level);
726 		pte = &parent[offset];
727 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
728 			break;
729 		if (level == *target_level)
730 			break;
731 
732 		if (!dma_pte_present(pte)) {
733 			uint64_t pteval, tmp;
734 
735 			tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
736 							     SZ_4K);
737 
738 			if (!tmp_page)
739 				return NULL;
740 
741 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
742 			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
743 				 DMA_PTE_WRITE;
744 			if (domain->use_first_level)
745 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
746 
747 			tmp = 0ULL;
748 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
749 				/* Someone else set it while we were thinking; use theirs. */
750 				iommu_free_pages(tmp_page);
751 			else
752 				domain_flush_cache(domain, pte, sizeof(*pte));
753 		}
754 		if (level == 1)
755 			break;
756 
757 		parent = phys_to_virt(dma_pte_addr(pte));
758 		level--;
759 	}
760 
761 	if (!*target_level)
762 		*target_level = level;
763 
764 	return pte;
765 }
766 
767 /* return address's pte at specific level */
768 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
769 					 unsigned long pfn,
770 					 int level, int *large_page)
771 {
772 	struct dma_pte *parent, *pte;
773 	int total = agaw_to_level(domain->agaw);
774 	int offset;
775 
776 	parent = domain->pgd;
777 	while (level <= total) {
778 		offset = pfn_level_offset(pfn, total);
779 		pte = &parent[offset];
780 		if (level == total)
781 			return pte;
782 
783 		if (!dma_pte_present(pte)) {
784 			*large_page = total;
785 			break;
786 		}
787 
788 		if (dma_pte_superpage(pte)) {
789 			*large_page = total;
790 			return pte;
791 		}
792 
793 		parent = phys_to_virt(dma_pte_addr(pte));
794 		total--;
795 	}
796 	return NULL;
797 }
798 
799 /* clear last level pte; a tlb flush should follow */
800 static void dma_pte_clear_range(struct dmar_domain *domain,
801 				unsigned long start_pfn,
802 				unsigned long last_pfn)
803 {
804 	unsigned int large_page;
805 	struct dma_pte *first_pte, *pte;
806 
807 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
808 	    WARN_ON(start_pfn > last_pfn))
809 		return;
810 
811 	/* we don't need lock here; nobody else touches the iova range */
812 	do {
813 		large_page = 1;
814 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
815 		if (!pte) {
816 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
817 			continue;
818 		}
819 		do {
820 			dma_clear_pte(pte);
821 			start_pfn += lvl_to_nr_pages(large_page);
822 			pte++;
823 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
824 
825 		domain_flush_cache(domain, first_pte,
826 				   (void *)pte - (void *)first_pte);
827 
828 	} while (start_pfn && start_pfn <= last_pfn);
829 }
830 
831 static void dma_pte_free_level(struct dmar_domain *domain, int level,
832 			       int retain_level, struct dma_pte *pte,
833 			       unsigned long pfn, unsigned long start_pfn,
834 			       unsigned long last_pfn)
835 {
836 	pfn = max(start_pfn, pfn);
837 	pte = &pte[pfn_level_offset(pfn, level)];
838 
839 	do {
840 		unsigned long level_pfn;
841 		struct dma_pte *level_pte;
842 
843 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
844 			goto next;
845 
846 		level_pfn = pfn & level_mask(level);
847 		level_pte = phys_to_virt(dma_pte_addr(pte));
848 
849 		if (level > 2) {
850 			dma_pte_free_level(domain, level - 1, retain_level,
851 					   level_pte, level_pfn, start_pfn,
852 					   last_pfn);
853 		}
854 
855 		/*
856 		 * Free the page table if we're below the level we want to
857 		 * retain and the range covers the entire table.
858 		 */
859 		if (level < retain_level && !(start_pfn > level_pfn ||
860 		      last_pfn < level_pfn + level_size(level) - 1)) {
861 			dma_clear_pte(pte);
862 			domain_flush_cache(domain, pte, sizeof(*pte));
863 			iommu_free_pages(level_pte);
864 		}
865 next:
866 		pfn += level_size(level);
867 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
868 }
869 
870 /*
871  * clear last level (leaf) ptes and free page table pages below the
872  * level we wish to keep intact.
873  */
874 static void dma_pte_free_pagetable(struct dmar_domain *domain,
875 				   unsigned long start_pfn,
876 				   unsigned long last_pfn,
877 				   int retain_level)
878 {
879 	dma_pte_clear_range(domain, start_pfn, last_pfn);
880 
881 	/* We don't need lock here; nobody else touches the iova range */
882 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
883 			   domain->pgd, 0, start_pfn, last_pfn);
884 
885 	/* free pgd */
886 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
887 		iommu_free_pages(domain->pgd);
888 		domain->pgd = NULL;
889 	}
890 }
891 
892 /* When a page at a given level is being unlinked from its parent, we don't
893    need to *modify* it at all. All we need to do is make a list of all the
894    pages which can be freed just as soon as we've flushed the IOTLB and we
895    know the hardware page-walk will no longer touch them.
896    The 'pte' argument is the *parent* PTE, pointing to the page that is to
897    be freed. */
898 static void dma_pte_list_pagetables(struct dmar_domain *domain,
899 				    int level, struct dma_pte *parent_pte,
900 				    struct iommu_pages_list *freelist)
901 {
902 	struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
903 
904 	iommu_pages_list_add(freelist, pte);
905 
906 	if (level == 1)
907 		return;
908 
909 	do {
910 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 		pte++;
913 	} while (!first_pte_in_page(pte));
914 }
915 
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 				struct dma_pte *pte, unsigned long pfn,
918 				unsigned long start_pfn, unsigned long last_pfn,
919 				struct iommu_pages_list *freelist)
920 {
921 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
922 
923 	pfn = max(start_pfn, pfn);
924 	pte = &pte[pfn_level_offset(pfn, level)];
925 
926 	do {
927 		unsigned long level_pfn = pfn & level_mask(level);
928 
929 		if (!dma_pte_present(pte))
930 			goto next;
931 
932 		/* If range covers entire pagetable, free it */
933 		if (start_pfn <= level_pfn &&
934 		    last_pfn >= level_pfn + level_size(level) - 1) {
935 			/* These subordinate page tables are going away entirely. Don't
936 			   bother to clear them; we're just going to *free* them. */
937 			if (level > 1 && !dma_pte_superpage(pte))
938 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939 
940 			dma_clear_pte(pte);
941 			if (!first_pte)
942 				first_pte = pte;
943 			last_pte = pte;
944 		} else if (level > 1) {
945 			/* Recurse down into a level that isn't *entirely* obsolete */
946 			dma_pte_clear_level(domain, level - 1,
947 					    phys_to_virt(dma_pte_addr(pte)),
948 					    level_pfn, start_pfn, last_pfn,
949 					    freelist);
950 		}
951 next:
952 		pfn = level_pfn + level_size(level);
953 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954 
955 	if (first_pte)
956 		domain_flush_cache(domain, first_pte,
957 				   (void *)++last_pte - (void *)first_pte);
958 }
959 
960 /* We can't just free the pages because the IOMMU may still be walking
961    the page tables, and may have cached the intermediate levels. The
962    pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 			 unsigned long last_pfn,
965 			 struct iommu_pages_list *freelist)
966 {
967 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
968 	    WARN_ON(start_pfn > last_pfn))
969 		return;
970 
971 	/* we don't need lock here; nobody else touches the iova range */
972 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
973 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
974 
975 	/* free pgd */
976 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
977 		iommu_pages_list_add(freelist, domain->pgd);
978 		domain->pgd = NULL;
979 	}
980 }
981 
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 	struct root_entry *root;
986 
987 	root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
988 	if (!root) {
989 		pr_err("Allocating root entry for %s failed\n",
990 			iommu->name);
991 		return -ENOMEM;
992 	}
993 
994 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
995 	iommu->root_entry = root;
996 
997 	return 0;
998 }
999 
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 	u64 addr;
1003 	u32 sts;
1004 	unsigned long flag;
1005 
1006 	addr = virt_to_phys(iommu->root_entry);
1007 	if (sm_supported(iommu))
1008 		addr |= DMA_RTADDR_SMT;
1009 
1010 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012 
1013 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014 
1015 	/* Make sure hardware completes it */
1016 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 		      readl, (sts & DMA_GSTS_RTPS), sts);
1018 
1019 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020 
1021 	/*
1022 	 * Hardware invalidates all DMA remapping hardware translation
1023 	 * caches as part of SRTP flow.
1024 	 */
1025 	if (cap_esrtps(iommu->cap))
1026 		return;
1027 
1028 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 	if (sm_supported(iommu))
1030 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033 
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 	u32 val;
1037 	unsigned long flag;
1038 
1039 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 		return;
1041 
1042 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044 
1045 	/* Make sure hardware completes it */
1046 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048 
1049 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051 
1052 /* return value determines if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 				  u16 did, u16 source_id, u8 function_mask,
1055 				  u64 type)
1056 {
1057 	u64 val = 0;
1058 	unsigned long flag;
1059 
1060 	switch (type) {
1061 	case DMA_CCMD_GLOBAL_INVL:
1062 		val = DMA_CCMD_GLOBAL_INVL;
1063 		break;
1064 	case DMA_CCMD_DOMAIN_INVL:
1065 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 		break;
1067 	case DMA_CCMD_DEVICE_INVL:
1068 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 		break;
1071 	default:
1072 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 			iommu->name, type);
1074 		return;
1075 	}
1076 	val |= DMA_CCMD_ICC;
1077 
1078 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080 
1081 	/* Make sure hardware completes it */
1082 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084 
1085 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087 
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 			 unsigned int size_order, u64 type)
1090 {
1091 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 	u64 val = 0, val_iva = 0;
1093 	unsigned long flag;
1094 
1095 	switch (type) {
1096 	case DMA_TLB_GLOBAL_FLUSH:
1097 		/* global flush doesn't need to set IVA_REG */
1098 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 		break;
1100 	case DMA_TLB_DSI_FLUSH:
1101 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 		break;
1103 	case DMA_TLB_PSI_FLUSH:
1104 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 		/* IH bit is passed in as part of address */
1106 		val_iva = size_order | addr;
1107 		break;
1108 	default:
1109 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 			iommu->name, type);
1111 		return;
1112 	}
1113 
1114 	if (cap_write_drain(iommu->cap))
1115 		val |= DMA_TLB_WRITE_DRAIN;
1116 
1117 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 	/* Note: Only uses first TLB reg currently */
1119 	if (val_iva)
1120 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122 
1123 	/* Make sure hardware completes it */
1124 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126 
1127 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128 
1129 	/* check IOTLB invalidation granularity */
1130 	if (DMA_TLB_IAIG(val) == 0)
1131 		pr_err("Flush IOTLB failed\n");
1132 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 			(unsigned long long)DMA_TLB_IIRG(type),
1135 			(unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137 
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 	struct device_domain_info *info;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&domain->lock, flags);
1146 	list_for_each_entry(info, &domain->devices, link) {
1147 		if (info->iommu == iommu && info->bus == bus &&
1148 		    info->devfn == devfn) {
1149 			spin_unlock_irqrestore(&domain->lock, flags);
1150 			return info;
1151 		}
1152 	}
1153 	spin_unlock_irqrestore(&domain->lock, flags);
1154 
1155 	return NULL;
1156 }
1157 
1158 /*
1159  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161  * check because it applies only to the built-in QAT devices and it doesn't
1162  * grant additional privileges.
1163  */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 		return false;
1169 
1170 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 		return false;
1172 
1173 	return true;
1174 }
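
/*
 * The mask test above matches exactly the four device IDs named in the
 * comment: 0x4940-0x4943 all satisfy (id & 0xfffc) == 0x4940, whereas e.g.
 * 0x4944 does not, so only the built-in QAT devices get the extra devTLB
 * flush.
 */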
1175 
1176 static void iommu_enable_pci_ats(struct device_domain_info *info)
1177 {
1178 	struct pci_dev *pdev;
1179 
1180 	if (!info->ats_supported)
1181 		return;
1182 
1183 	pdev = to_pci_dev(info->dev);
1184 	if (!pci_ats_page_aligned(pdev))
1185 		return;
1186 
1187 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1188 		info->ats_enabled = 1;
1189 }
1190 
1191 static void iommu_disable_pci_ats(struct device_domain_info *info)
1192 {
1193 	if (!info->ats_enabled)
1194 		return;
1195 
1196 	pci_disable_ats(to_pci_dev(info->dev));
1197 	info->ats_enabled = 0;
1198 }
1199 
1200 static void iommu_enable_pci_pri(struct device_domain_info *info)
1201 {
1202 	struct pci_dev *pdev;
1203 
1204 	if (!info->ats_enabled || !info->pri_supported)
1205 		return;
1206 
1207 	pdev = to_pci_dev(info->dev);
1208 	/* PASID is required in PRG Response Message. */
1209 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1210 		return;
1211 
1212 	if (pci_reset_pri(pdev))
1213 		return;
1214 
1215 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
1216 		info->pri_enabled = 1;
1217 }
1218 
1219 static void iommu_disable_pci_pri(struct device_domain_info *info)
1220 {
1221 	if (!info->pri_enabled)
1222 		return;
1223 
1224 	if (WARN_ON(info->iopf_refcount))
1225 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1226 
1227 	pci_disable_pri(to_pci_dev(info->dev));
1228 	info->pri_enabled = 0;
1229 }
1230 
1231 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1232 {
1233 	cache_tag_flush_all(to_dmar_domain(domain));
1234 }
1235 
1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1237 {
1238 	u32 pmen;
1239 	unsigned long flags;
1240 
1241 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1242 		return;
1243 
1244 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1245 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1246 	pmen &= ~DMA_PMEN_EPM;
1247 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1248 
1249 	/* wait for the protected region status bit to clear */
1250 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1251 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1252 
1253 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1254 }
1255 
1256 static void iommu_enable_translation(struct intel_iommu *iommu)
1257 {
1258 	u32 sts;
1259 	unsigned long flags;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1262 	iommu->gcmd |= DMA_GCMD_TE;
1263 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1264 
1265 	/* Make sure hardware completes it */
1266 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1267 		      readl, (sts & DMA_GSTS_TES), sts);
1268 
1269 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1270 }
1271 
1272 static void iommu_disable_translation(struct intel_iommu *iommu)
1273 {
1274 	u32 sts;
1275 	unsigned long flag;
1276 
1277 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1278 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1279 		return;
1280 
1281 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 	iommu->gcmd &= ~DMA_GCMD_TE;
1283 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1284 
1285 	/* Make sure hardware completes it */
1286 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1287 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1288 
1289 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 }
1291 
1292 static void disable_dmar_iommu(struct intel_iommu *iommu)
1293 {
1294 	/*
1295 	 * All iommu domains must have been detached from the devices,
1296 	 * hence there should be no domain IDs in use.
1297 	 */
1298 	if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
1299 		return;
1300 
1301 	if (iommu->gcmd & DMA_GCMD_TE)
1302 		iommu_disable_translation(iommu);
1303 }
1304 
1305 static void free_dmar_iommu(struct intel_iommu *iommu)
1306 {
1307 	if (iommu->copied_tables) {
1308 		bitmap_free(iommu->copied_tables);
1309 		iommu->copied_tables = NULL;
1310 	}
1311 
1312 	/* free context mapping */
1313 	free_context_table(iommu);
1314 
1315 	if (ecap_prs(iommu->ecap))
1316 		intel_iommu_finish_prq(iommu);
1317 }
1318 
1319 /*
1320  * Check and return whether first level is used by default for
1321  * DMA translation.
1322  */
1323 static bool first_level_by_default(struct intel_iommu *iommu)
1324 {
1325 	/* Only SL is available in legacy mode */
1326 	if (!sm_supported(iommu))
1327 		return false;
1328 
1329 	/* Only one level (either FL or SL) is available, just use it */
1330 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1331 		return ecap_flts(iommu->ecap);
1332 
1333 	return true;
1334 }
1335 
1336 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1337 {
1338 	struct iommu_domain_info *info, *curr;
1339 	int num, ret = -ENOSPC;
1340 
1341 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1342 		return 0;
1343 
1344 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1345 	if (!info)
1346 		return -ENOMEM;
1347 
1348 	guard(mutex)(&iommu->did_lock);
1349 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1350 	if (curr) {
1351 		curr->refcnt++;
1352 		kfree(info);
1353 		return 0;
1354 	}
1355 
1356 	num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1357 			      cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1358 	if (num < 0) {
1359 		pr_err("%s: No free domain ids\n", iommu->name);
1360 		goto err_unlock;
1361 	}
1362 
1363 	info->refcnt	= 1;
1364 	info->did	= num;
1365 	info->iommu	= iommu;
1366 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1367 			  NULL, info, GFP_KERNEL);
1368 	if (curr) {
1369 		ret = xa_err(curr) ? : -EBUSY;
1370 		goto err_clear;
1371 	}
1372 
1373 	return 0;
1374 
1375 err_clear:
1376 	ida_free(&iommu->domain_ida, info->did);
1377 err_unlock:
1378 	kfree(info);
1379 	return ret;
1380 }
1381 
1382 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1383 {
1384 	struct iommu_domain_info *info;
1385 
1386 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1387 		return;
1388 
1389 	guard(mutex)(&iommu->did_lock);
1390 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1391 	if (--info->refcnt == 0) {
1392 		ida_free(&iommu->domain_ida, info->did);
1393 		xa_erase(&domain->iommu_array, iommu->seq_id);
1394 		domain->nid = NUMA_NO_NODE;
1395 		kfree(info);
1396 	}
1397 }
1398 
1399 static void domain_exit(struct dmar_domain *domain)
1400 {
1401 	if (domain->pgd) {
1402 		struct iommu_pages_list freelist =
1403 			IOMMU_PAGES_LIST_INIT(freelist);
1404 
1405 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1406 		iommu_put_pages_list(&freelist);
1407 	}
1408 
1409 	if (WARN_ON(!list_empty(&domain->devices)))
1410 		return;
1411 
1412 	kfree(domain->qi_batch);
1413 	kfree(domain);
1414 }
1415 
1416 /*
1417  * For kdump cases, old valid entries may be cached due to the
1418  * in-flight DMA and the copied pgtable, but there is no unmapping
1419  * path for them, so we need an explicit cache flush for the
1420  * newly-mapped device. For kdump, at this point the device is
1421  * supposed to have finished its reset at the driver probe stage,
1422  * so no in-flight DMA will exist and we don't need to worry about
1423  * it hereafter.
1424  */
1425 static void copied_context_tear_down(struct intel_iommu *iommu,
1426 				     struct context_entry *context,
1427 				     u8 bus, u8 devfn)
1428 {
1429 	u16 did_old;
1430 
1431 	if (!context_copied(iommu, bus, devfn))
1432 		return;
1433 
1434 	assert_spin_locked(&iommu->lock);
1435 
1436 	did_old = context_domain_id(context);
1437 	context_clear_entry(context);
1438 
1439 	if (did_old < cap_ndoms(iommu->cap)) {
1440 		iommu->flush.flush_context(iommu, did_old,
1441 					   PCI_DEVID(bus, devfn),
1442 					   DMA_CCMD_MASK_NOBIT,
1443 					   DMA_CCMD_DEVICE_INVL);
1444 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1445 					 DMA_TLB_DSI_FLUSH);
1446 	}
1447 
1448 	clear_context_copied(iommu, bus, devfn);
1449 }
1450 
1451 /*
1452  * It's a non-present to present mapping. If hardware doesn't cache
1453  * non-present entries, we only need to flush the write-buffer. If it
1454  * _does_ cache non-present entries, then it does so in the special
1455  * domain #0, which we have to flush:
1456  */
1457 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1458 					u8 bus, u8 devfn)
1459 {
1460 	if (cap_caching_mode(iommu->cap)) {
1461 		iommu->flush.flush_context(iommu, 0,
1462 					   PCI_DEVID(bus, devfn),
1463 					   DMA_CCMD_MASK_NOBIT,
1464 					   DMA_CCMD_DEVICE_INVL);
1465 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1466 	} else {
1467 		iommu_flush_write_buffer(iommu);
1468 	}
1469 }
1470 
1471 static int domain_context_mapping_one(struct dmar_domain *domain,
1472 				      struct intel_iommu *iommu,
1473 				      u8 bus, u8 devfn)
1474 {
1475 	struct device_domain_info *info =
1476 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1477 	u16 did = domain_id_iommu(domain, iommu);
1478 	int translation = CONTEXT_TT_MULTI_LEVEL;
1479 	struct dma_pte *pgd = domain->pgd;
1480 	struct context_entry *context;
1481 	int ret;
1482 
1483 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1484 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1485 
1486 	spin_lock(&iommu->lock);
1487 	ret = -ENOMEM;
1488 	context = iommu_context_addr(iommu, bus, devfn, 1);
1489 	if (!context)
1490 		goto out_unlock;
1491 
1492 	ret = 0;
1493 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1494 		goto out_unlock;
1495 
1496 	copied_context_tear_down(iommu, context, bus, devfn);
1497 	context_clear_entry(context);
1498 	context_set_domain_id(context, did);
1499 
1500 	if (info && info->ats_supported)
1501 		translation = CONTEXT_TT_DEV_IOTLB;
1502 	else
1503 		translation = CONTEXT_TT_MULTI_LEVEL;
1504 
1505 	context_set_address_root(context, virt_to_phys(pgd));
1506 	context_set_address_width(context, domain->agaw);
1507 	context_set_translation_type(context, translation);
1508 	context_set_fault_enable(context);
1509 	context_set_present(context);
1510 	if (!ecap_coherent(iommu->ecap))
1511 		clflush_cache_range(context, sizeof(*context));
1512 	context_present_cache_flush(iommu, did, bus, devfn);
1513 	ret = 0;
1514 
1515 out_unlock:
1516 	spin_unlock(&iommu->lock);
1517 
1518 	return ret;
1519 }
1520 
1521 static int domain_context_mapping_cb(struct pci_dev *pdev,
1522 				     u16 alias, void *opaque)
1523 {
1524 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1525 	struct intel_iommu *iommu = info->iommu;
1526 	struct dmar_domain *domain = opaque;
1527 
1528 	return domain_context_mapping_one(domain, iommu,
1529 					  PCI_BUS_NUM(alias), alias & 0xff);
1530 }
1531 
1532 static int
1533 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1534 {
1535 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1536 	struct intel_iommu *iommu = info->iommu;
1537 	u8 bus = info->bus, devfn = info->devfn;
1538 	int ret;
1539 
1540 	if (!dev_is_pci(dev))
1541 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1542 
1543 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1544 				     domain_context_mapping_cb, domain);
1545 	if (ret)
1546 		return ret;
1547 
1548 	iommu_enable_pci_ats(info);
1549 
1550 	return 0;
1551 }
1552 
1553 /* Return largest possible superpage level for a given mapping */
1554 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1555 				   unsigned long phy_pfn, unsigned long pages)
1556 {
1557 	int support, level = 1;
1558 	unsigned long pfnmerge;
1559 
1560 	support = domain->iommu_superpage;
1561 
1562 	/* To use a large page, the virtual *and* physical addresses
1563 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1564 	   of them will mean we have to use smaller pages. So just
1565 	   merge them and check both at once. */
1566 	pfnmerge = iov_pfn | phy_pfn;
1567 
1568 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1569 		pages >>= VTD_STRIDE_SHIFT;
1570 		if (!pages)
1571 			break;
1572 		pfnmerge >>= VTD_STRIDE_SHIFT;
1573 		level++;
1574 		support--;
1575 	}
1576 	return level;
1577 }
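
/*
 * For example, with a 9-bit stride: a request to map 512 pages (2MiB) at
 * iov_pfn 0x200 with phy_pfn 0x40200 merges to 0x40200, whose low 9 bits
 * are clear, so the loop promotes the mapping to level 2 (a 2MiB superpage)
 * provided domain->iommu_superpage allows it. A 511-page request at the
 * same address stays at level 1 because pages >> 9 becomes zero.
 */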
1578 
1579 /*
1580  * Ensure that old small page tables are removed to make room for superpage(s).
1581  * We're going to add new large pages, so make sure we don't remove their parent
1582  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1583  */
1584 static void switch_to_super_page(struct dmar_domain *domain,
1585 				 unsigned long start_pfn,
1586 				 unsigned long end_pfn, int level)
1587 {
1588 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1589 	struct dma_pte *pte = NULL;
1590 
1591 	while (start_pfn <= end_pfn) {
1592 		if (!pte)
1593 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1594 					     GFP_ATOMIC);
1595 
1596 		if (dma_pte_present(pte)) {
1597 			dma_pte_free_pagetable(domain, start_pfn,
1598 					       start_pfn + lvl_pages - 1,
1599 					       level + 1);
1600 
1601 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1602 					      end_pfn << VTD_PAGE_SHIFT, 0);
1603 		}
1604 
1605 		pte++;
1606 		start_pfn += lvl_pages;
1607 		if (first_pte_in_page(pte))
1608 			pte = NULL;
1609 	}
1610 }
1611 
1612 static int
1613 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1614 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1615 		 gfp_t gfp)
1616 {
1617 	struct dma_pte *first_pte = NULL, *pte = NULL;
1618 	unsigned int largepage_lvl = 0;
1619 	unsigned long lvl_pages = 0;
1620 	phys_addr_t pteval;
1621 	u64 attr;
1622 
1623 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1624 		return -EINVAL;
1625 
1626 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1627 		return -EINVAL;
1628 
1629 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1630 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1631 		return -EINVAL;
1632 	}
1633 
1634 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1635 	if (domain->use_first_level) {
1636 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1637 		if (prot & DMA_PTE_WRITE)
1638 			attr |= DMA_FL_PTE_DIRTY;
1639 	}
1640 
1641 	domain->has_mappings = true;
1642 
1643 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1644 
1645 	while (nr_pages > 0) {
1646 		uint64_t tmp;
1647 
1648 		if (!pte) {
1649 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1650 					phys_pfn, nr_pages);
1651 
1652 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1653 					     gfp);
1654 			if (!pte)
1655 				return -ENOMEM;
1656 			first_pte = pte;
1657 
1658 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1659 
1660 			/* It is a large page */
1661 			if (largepage_lvl > 1) {
1662 				unsigned long end_pfn;
1663 				unsigned long pages_to_remove;
1664 
1665 				pteval |= DMA_PTE_LARGE_PAGE;
1666 				pages_to_remove = min_t(unsigned long, nr_pages,
1667 							nr_pte_to_next_page(pte) * lvl_pages);
1668 				end_pfn = iov_pfn + pages_to_remove - 1;
1669 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1670 			} else {
1671 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1672 			}
1673 
1674 		}
1675 		/* We don't need lock here, nobody else
1676 		 * touches the iova range
1677 		 */
1678 		tmp = 0ULL;
1679 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1680 			static int dumps = 5;
1681 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1682 				iov_pfn, tmp, (unsigned long long)pteval);
1683 			if (dumps) {
1684 				dumps--;
1685 				debug_dma_dump_mappings(NULL);
1686 			}
1687 			WARN_ON(1);
1688 		}
1689 
1690 		nr_pages -= lvl_pages;
1691 		iov_pfn += lvl_pages;
1692 		phys_pfn += lvl_pages;
1693 		pteval += lvl_pages * VTD_PAGE_SIZE;
1694 
1695 		/* If the next PTE would be the first in a new page, then we
1696 		 * need to flush the cache on the entries we've just written.
1697 		 * And then we'll need to recalculate 'pte', so clear it and
1698 		 * let it get set again in the if (!pte) block above.
1699 		 *
1700 		 * If we're done (!nr_pages) we need to flush the cache too.
1701 		 *
1702 		 * Also if we've been setting superpages, we may need to
1703 		 * recalculate 'pte' and switch back to smaller pages for the
1704 		 * end of the mapping, if the trailing size is not enough to
1705 		 * use another superpage (i.e. nr_pages < lvl_pages).
1706 		 */
1707 		pte++;
1708 		if (!nr_pages || first_pte_in_page(pte) ||
1709 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1710 			domain_flush_cache(domain, first_pte,
1711 					   (void *)pte - (void *)first_pte);
1712 			pte = NULL;
1713 		}
1714 	}
1715 
1716 	return 0;
1717 }
1718 
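/*
 * Clear the legacy-mode context entry for @bus/@devfn under iommu->lock,
 * write the change back to memory, and then flush the stale context-cache
 * and IOTLB state via intel_context_flush_no_pasid() using the domain ID
 * the entry used to carry.
 */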
1719 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1720 {
1721 	struct intel_iommu *iommu = info->iommu;
1722 	struct context_entry *context;
1723 	u16 did;
1724 
1725 	spin_lock(&iommu->lock);
1726 	context = iommu_context_addr(iommu, bus, devfn, 0);
1727 	if (!context) {
1728 		spin_unlock(&iommu->lock);
1729 		return;
1730 	}
1731 
1732 	did = context_domain_id(context);
1733 	context_clear_entry(context);
1734 	__iommu_flush_cache(iommu, context, sizeof(*context));
1735 	spin_unlock(&iommu->lock);
1736 	intel_context_flush_no_pasid(info, context, did);
1737 }
1738 
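/*
 * The __domain_setup_*()/domain_setup_*() helpers below program the PASID
 * table entry for one (device, PASID) pair. When @old is NULL a fresh
 * entry is set up; otherwise the existing entry is replaced, passing the
 * old domain ID so the required invalidations can be issued.
 */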
1739 int __domain_setup_first_level(struct intel_iommu *iommu,
1740 			       struct device *dev, ioasid_t pasid,
1741 			       u16 did, pgd_t *pgd, int flags,
1742 			       struct iommu_domain *old)
1743 {
1744 	if (!old)
1745 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1746 						     pasid, did, flags);
1747 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1748 					       iommu_domain_did(old, iommu),
1749 					       flags);
1750 }
1751 
1752 static int domain_setup_second_level(struct intel_iommu *iommu,
1753 				     struct dmar_domain *domain,
1754 				     struct device *dev, ioasid_t pasid,
1755 				     struct iommu_domain *old)
1756 {
1757 	if (!old)
1758 		return intel_pasid_setup_second_level(iommu, domain,
1759 						      dev, pasid);
1760 	return intel_pasid_replace_second_level(iommu, domain, dev,
1761 						iommu_domain_did(old, iommu),
1762 						pasid);
1763 }
1764 
1765 static int domain_setup_passthrough(struct intel_iommu *iommu,
1766 				    struct device *dev, ioasid_t pasid,
1767 				    struct iommu_domain *old)
1768 {
1769 	if (!old)
1770 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1771 	return intel_pasid_replace_pass_through(iommu, dev,
1772 						iommu_domain_did(old, iommu),
1773 						pasid);
1774 }
1775 
1776 static int domain_setup_first_level(struct intel_iommu *iommu,
1777 				    struct dmar_domain *domain,
1778 				    struct device *dev,
1779 				    u32 pasid, struct iommu_domain *old)
1780 {
1781 	struct dma_pte *pgd = domain->pgd;
1782 	int level, flags = 0;
1783 
1784 	level = agaw_to_level(domain->agaw);
1785 	if (level != 4 && level != 5)
1786 		return -EINVAL;
1787 
1788 	if (level == 5)
1789 		flags |= PASID_FLAG_FL5LP;
1790 
1791 	if (domain->force_snooping)
1792 		flags |= PASID_FLAG_PAGE_SNOOP;
1793 
1794 	return __domain_setup_first_level(iommu, dev, pasid,
1795 					  domain_id_iommu(domain, iommu),
1796 					  (pgd_t *)pgd, flags, old);
1797 }
1798 
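/*
 * Attach @dev to @domain for request-without-PASID DMA: take a domain ID
 * on the device's IOMMU, link the device into the domain's device list,
 * then program either the legacy context entry or the scalable-mode PASID
 * entry (first- or second-stage) and assign a cache tag. On any failure
 * the device is put back into the blocked state.
 */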
1799 static int dmar_domain_attach_device(struct dmar_domain *domain,
1800 				     struct device *dev)
1801 {
1802 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1803 	struct intel_iommu *iommu = info->iommu;
1804 	unsigned long flags;
1805 	int ret;
1806 
1807 	ret = domain_attach_iommu(domain, iommu);
1808 	if (ret)
1809 		return ret;
1810 
1811 	info->domain = domain;
1812 	info->domain_attached = true;
1813 	spin_lock_irqsave(&domain->lock, flags);
1814 	list_add(&info->link, &domain->devices);
1815 	spin_unlock_irqrestore(&domain->lock, flags);
1816 
1817 	if (dev_is_real_dma_subdevice(dev))
1818 		return 0;
1819 
1820 	if (!sm_supported(iommu))
1821 		ret = domain_context_mapping(domain, dev);
1822 	else if (domain->use_first_level)
1823 		ret = domain_setup_first_level(iommu, domain, dev,
1824 					       IOMMU_NO_PASID, NULL);
1825 	else
1826 		ret = domain_setup_second_level(iommu, domain, dev,
1827 						IOMMU_NO_PASID, NULL);
1828 
1829 	if (ret)
1830 		goto out_block_translation;
1831 
1832 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1833 	if (ret)
1834 		goto out_block_translation;
1835 
1836 	return 0;
1837 
1838 out_block_translation:
1839 	device_block_translation(dev);
1840 	return ret;
1841 }
1842 
1843 /**
1844  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1845  * is relaxable (i.e., it may be left unenforced under some conditions)
1846  * @dev: device handle
1847  *
1848  * We assume that PCI USB devices with RMRRs have them largely
1849  * for historical reasons and that the RMRR space is not actively used post
1850  * boot.  This exclusion may change if vendors begin to abuse it.
1851  *
1852  * The same exception is made for graphics devices, with the requirement that
1853  * any use of the RMRR regions will be torn down before assigning the device
1854  * to a guest.
1855  *
1856  * Return: true if the RMRR is relaxable, false otherwise
1857  */
1858 static bool device_rmrr_is_relaxable(struct device *dev)
1859 {
1860 	struct pci_dev *pdev;
1861 
1862 	if (!dev_is_pci(dev))
1863 		return false;
1864 
1865 	pdev = to_pci_dev(dev);
1866 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1867 		return true;
1868 	else
1869 		return false;
1870 }
1871 
1872 static int device_def_domain_type(struct device *dev)
1873 {
1874 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1875 	struct intel_iommu *iommu = info->iommu;
1876 
1877 	/*
1878 	 * Hardware does not support the passthrough translation mode.
1879 	 * Always use a dynamic mapping domain.
1880 	 */
1881 	if (!ecap_pass_through(iommu->ecap))
1882 		return IOMMU_DOMAIN_DMA;
1883 
1884 	if (dev_is_pci(dev)) {
1885 		struct pci_dev *pdev = to_pci_dev(dev);
1886 
1887 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1888 			return IOMMU_DOMAIN_IDENTITY;
1889 	}
1890 
1891 	return 0;
1892 }
1893 
1894 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1895 {
1896 	/*
1897 	 * Start from a sane IOMMU hardware state.
1898 	 * If queued invalidation was already initialized by us
1899 	 * (for example, while enabling interrupt remapping), then
1900 	 * things are already rolling from a sane state.
1901 	 */
1902 	if (!iommu->qi) {
1903 		/*
1904 		 * Clear any previous faults.
1905 		 */
1906 		dmar_fault(-1, iommu);
1907 		/*
1908 		 * Disable queued invalidation if supported and already enabled
1909 		 * before OS handover.
1910 		 */
1911 		dmar_disable_qi(iommu);
1912 	}
1913 
1914 	if (dmar_enable_qi(iommu)) {
1915 		/*
1916 		 * Queued invalidation not enabled; use register-based invalidation
1917 		 */
1918 		iommu->flush.flush_context = __iommu_flush_context;
1919 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1920 		pr_info("%s: Using Register based invalidation\n",
1921 			iommu->name);
1922 	} else {
1923 		iommu->flush.flush_context = qi_flush_context;
1924 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1925 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1926 	}
1927 }
1928 
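/*
 * Copy one bus' worth of context entries from the previous kernel's
 * tables (kdump handover). With extended/scalable-mode tables each entry
 * is twice as large, so a bus is backed by two 4K context tables: devfns
 * below 0x80 come from the lower context-table pointer and the rest from
 * the upper one, which is what the "* 2" table index and the "% 256"
 * wrap of the entry index below reflect.
 */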
1929 static int copy_context_table(struct intel_iommu *iommu,
1930 			      struct root_entry *old_re,
1931 			      struct context_entry **tbl,
1932 			      int bus, bool ext)
1933 {
1934 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1935 	struct context_entry *new_ce = NULL, ce;
1936 	struct context_entry *old_ce = NULL;
1937 	struct root_entry re;
1938 	phys_addr_t old_ce_phys;
1939 
1940 	tbl_idx = ext ? bus * 2 : bus;
1941 	memcpy(&re, old_re, sizeof(re));
1942 
1943 	for (devfn = 0; devfn < 256; devfn++) {
1944 		/* First calculate the correct index */
1945 		idx = (ext ? devfn * 2 : devfn) % 256;
1946 
1947 		if (idx == 0) {
1948 			/* First save what we may have and clean up */
1949 			if (new_ce) {
1950 				tbl[tbl_idx] = new_ce;
1951 				__iommu_flush_cache(iommu, new_ce,
1952 						    VTD_PAGE_SIZE);
1953 				pos = 1;
1954 			}
1955 
1956 			if (old_ce)
1957 				memunmap(old_ce);
1958 
1959 			ret = 0;
1960 			if (devfn < 0x80)
1961 				old_ce_phys = root_entry_lctp(&re);
1962 			else
1963 				old_ce_phys = root_entry_uctp(&re);
1964 
1965 			if (!old_ce_phys) {
1966 				if (ext && devfn == 0) {
1967 					/* No LCTP, try UCTP */
1968 					devfn = 0x7f;
1969 					continue;
1970 				} else {
1971 					goto out;
1972 				}
1973 			}
1974 
1975 			ret = -ENOMEM;
1976 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1977 					MEMREMAP_WB);
1978 			if (!old_ce)
1979 				goto out;
1980 
1981 			new_ce = iommu_alloc_pages_node_sz(iommu->node,
1982 							   GFP_KERNEL, SZ_4K);
1983 			if (!new_ce)
1984 				goto out_unmap;
1985 
1986 			ret = 0;
1987 		}
1988 
1989 		/* Now copy the context entry */
1990 		memcpy(&ce, old_ce + idx, sizeof(ce));
1991 
1992 		if (!context_present(&ce))
1993 			continue;
1994 
1995 		did = context_domain_id(&ce);
1996 		if (did >= 0 && did < cap_ndoms(iommu->cap))
1997 			ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
1998 
1999 		set_context_copied(iommu, bus, devfn);
2000 		new_ce[idx] = ce;
2001 	}
2002 
2003 	tbl[tbl_idx + pos] = new_ce;
2004 
2005 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2006 
2007 out_unmap:
2008 	memunmap(old_ce);
2009 
2010 out:
2011 	return ret;
2012 }
2013 
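/*
 * Copy the whole root/context table hierarchy left behind by the previous
 * kernel while translation is still enabled (the kdump case), so in-flight
 * DMA keeps a valid translation while this kernel takes over. Bail out if
 * the old and new tables do not use the same format, since switching the
 * format would require disabling translation first.
 */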
2014 static int copy_translation_tables(struct intel_iommu *iommu)
2015 {
2016 	struct context_entry **ctxt_tbls;
2017 	struct root_entry *old_rt;
2018 	phys_addr_t old_rt_phys;
2019 	int ctxt_table_entries;
2020 	u64 rtaddr_reg;
2021 	int bus, ret;
2022 	bool new_ext, ext;
2023 
2024 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2025 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2026 	new_ext    = !!sm_supported(iommu);
2027 
2028 	/*
2029 	 * The RTT bit can only be changed when translation is disabled,
2030 	 * but disabling translation would open a window for data
2031 	 * corruption. So bail out and don't copy anything if we would
2032 	 * have to change the bit.
2033 	 */
2034 	if (new_ext != ext)
2035 		return -EINVAL;
2036 
2037 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2038 	if (!iommu->copied_tables)
2039 		return -ENOMEM;
2040 
2041 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2042 	if (!old_rt_phys)
2043 		return -EINVAL;
2044 
2045 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2046 	if (!old_rt)
2047 		return -ENOMEM;
2048 
2049 	/* This is too big for the stack - allocate it from slab */
2050 	ctxt_table_entries = ext ? 512 : 256;
2051 	ret = -ENOMEM;
2052 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2053 	if (!ctxt_tbls)
2054 		goto out_unmap;
2055 
2056 	for (bus = 0; bus < 256; bus++) {
2057 		ret = copy_context_table(iommu, &old_rt[bus],
2058 					 ctxt_tbls, bus, ext);
2059 		if (ret) {
2060 			pr_err("%s: Failed to copy context table for bus %d\n",
2061 				iommu->name, bus);
2062 			continue;
2063 		}
2064 	}
2065 
2066 	spin_lock(&iommu->lock);
2067 
2068 	/* Context tables are copied, now write them to the root_entry table */
2069 	for (bus = 0; bus < 256; bus++) {
2070 		int idx = ext ? bus * 2 : bus;
2071 		u64 val;
2072 
2073 		if (ctxt_tbls[idx]) {
2074 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2075 			iommu->root_entry[bus].lo = val;
2076 		}
2077 
2078 		if (!ext || !ctxt_tbls[idx + 1])
2079 			continue;
2080 
2081 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2082 		iommu->root_entry[bus].hi = val;
2083 	}
2084 
2085 	spin_unlock(&iommu->lock);
2086 
2087 	kfree(ctxt_tbls);
2088 
2089 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2090 
2091 	ret = 0;
2092 
2093 out_unmap:
2094 	memunmap(old_rt);
2095 
2096 	return ret;
2097 }
2098 
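/*
 * One-time bring-up of every DMAR unit found at boot: initialise queued
 * invalidation, allocate (or copy, on kdump) the root entries and program
 * them, then enable page-request queues and fault reporting. Translation
 * itself is switched on later from intel_iommu_init().
 */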
2099 static int __init init_dmars(void)
2100 {
2101 	struct dmar_drhd_unit *drhd;
2102 	struct intel_iommu *iommu;
2103 	int ret;
2104 
2105 	for_each_iommu(iommu, drhd) {
2106 		if (drhd->ignored) {
2107 			iommu_disable_translation(iommu);
2108 			continue;
2109 		}
2110 
2111 		/*
2112 		 * Find the maximum PASID size supported by each IOMMU in
2113 		 * the system. The system-wide PASID table must be no bigger
2114 		 * than the smallest size supported.
2115 		 */
2116 		if (pasid_supported(iommu)) {
2117 			u32 temp = 2 << ecap_pss(iommu->ecap);
2118 
2119 			intel_pasid_max_id = min_t(u32, temp,
2120 						   intel_pasid_max_id);
2121 		}
2122 
2123 		intel_iommu_init_qi(iommu);
2124 		init_translation_status(iommu);
2125 
2126 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2127 			iommu_disable_translation(iommu);
2128 			clear_translation_pre_enabled(iommu);
2129 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2130 				iommu->name);
2131 		}
2132 
2133 		/*
2134 		 * TBD:
2135 		 * we could share the same root & context tables
2136 		 * among all IOMMUs. Split this out later.
2137 		 */
2138 		ret = iommu_alloc_root_entry(iommu);
2139 		if (ret)
2140 			goto free_iommu;
2141 
2142 		if (translation_pre_enabled(iommu)) {
2143 			pr_info("Translation already enabled - trying to copy translation structures\n");
2144 
2145 			ret = copy_translation_tables(iommu);
2146 			if (ret) {
2147 				/*
2148 				 * We found the IOMMU with translation
2149 				 * enabled - but failed to copy over the
2150 				 * old root-entry table. Try to proceed
2151 				 * by disabling translation now and
2152 				 * allocating a clean root-entry table.
2153 				 * This might cause DMAR faults, but
2154 				 * probably the dump will still succeed.
2155 				 */
2156 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2157 				       iommu->name);
2158 				iommu_disable_translation(iommu);
2159 				clear_translation_pre_enabled(iommu);
2160 			} else {
2161 				pr_info("Copied translation tables from previous kernel for %s\n",
2162 					iommu->name);
2163 			}
2164 		}
2165 
2166 		intel_svm_check(iommu);
2167 	}
2168 
2169 	/*
2170 	 * Now that qi is enabled on all iommus, set the root entry and flush
2171 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2172 	 * flush_context function will loop forever and the boot hangs.
2173 	 */
2174 	for_each_active_iommu(iommu, drhd) {
2175 		iommu_flush_write_buffer(iommu);
2176 		iommu_set_root_entry(iommu);
2177 	}
2178 
2179 	check_tylersburg_isoch();
2180 
2181 	/*
2182 	 * for each drhd
2183 	 *   enable fault log
2184 	 *   global invalidate context cache
2185 	 *   global invalidate iotlb
2186 	 *   enable translation
2187 	 */
2188 	for_each_iommu(iommu, drhd) {
2189 		if (drhd->ignored) {
2190 			/*
2191 			 * we always have to disable PMRs or DMA may fail on
2192 			 * this device
2193 			 */
2194 			if (force_on)
2195 				iommu_disable_protect_mem_regions(iommu);
2196 			continue;
2197 		}
2198 
2199 		iommu_flush_write_buffer(iommu);
2200 
2201 		if (ecap_prs(iommu->ecap)) {
2202 			/*
2203 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2204 			 * could cause a lock race, so drop the lock around it.
2205 			 */
2206 			up_write(&dmar_global_lock);
2207 			ret = intel_iommu_enable_prq(iommu);
2208 			down_write(&dmar_global_lock);
2209 			if (ret)
2210 				goto free_iommu;
2211 		}
2212 
2213 		ret = dmar_set_interrupt(iommu);
2214 		if (ret)
2215 			goto free_iommu;
2216 	}
2217 
2218 	return 0;
2219 
2220 free_iommu:
2221 	for_each_active_iommu(iommu, drhd) {
2222 		disable_dmar_iommu(iommu);
2223 		free_dmar_iommu(iommu);
2224 	}
2225 
2226 	return ret;
2227 }
2228 
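/*
 * Mark DRHD units that should be ignored: units whose device scope turned
 * out to be empty, and graphics-only units when the integrated-graphics
 * IOMMU has been disabled on the kernel command line.
 */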
2229 static void __init init_no_remapping_devices(void)
2230 {
2231 	struct dmar_drhd_unit *drhd;
2232 	struct device *dev;
2233 	int i;
2234 
2235 	for_each_drhd_unit(drhd) {
2236 		if (!drhd->include_all) {
2237 			for_each_active_dev_scope(drhd->devices,
2238 						  drhd->devices_cnt, i, dev)
2239 				break;
2240 			/* ignore DMAR unit if no devices exist */
2241 			if (i == drhd->devices_cnt)
2242 				drhd->ignored = 1;
2243 		}
2244 	}
2245 
2246 	for_each_active_drhd_unit(drhd) {
2247 		if (drhd->include_all)
2248 			continue;
2249 
2250 		for_each_active_dev_scope(drhd->devices,
2251 					  drhd->devices_cnt, i, dev)
2252 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2253 				break;
2254 		if (i < drhd->devices_cnt)
2255 			continue;
2256 
2257 		/* This IOMMU has *only* gfx devices. Either bypass it or
2258 		   set the gfx_dedicated flag, as appropriate. */
2259 		drhd->gfx_dedicated = 1;
2260 		if (disable_igfx_iommu)
2261 			drhd->ignored = 1;
2262 	}
2263 }
2264 
2265 #ifdef CONFIG_SUSPEND
2266 static int init_iommu_hw(void)
2267 {
2268 	struct dmar_drhd_unit *drhd;
2269 	struct intel_iommu *iommu = NULL;
2270 	int ret;
2271 
2272 	for_each_active_iommu(iommu, drhd) {
2273 		if (iommu->qi) {
2274 			ret = dmar_reenable_qi(iommu);
2275 			if (ret)
2276 				return ret;
2277 		}
2278 	}
2279 
2280 	for_each_iommu(iommu, drhd) {
2281 		if (drhd->ignored) {
2282 			/*
2283 			 * we always have to disable PMRs or DMA may fail on
2284 			 * this device
2285 			 */
2286 			if (force_on)
2287 				iommu_disable_protect_mem_regions(iommu);
2288 			continue;
2289 		}
2290 
2291 		iommu_flush_write_buffer(iommu);
2292 		iommu_set_root_entry(iommu);
2293 		iommu_enable_translation(iommu);
2294 		iommu_disable_protect_mem_regions(iommu);
2295 	}
2296 
2297 	return 0;
2298 }
2299 
2300 static void iommu_flush_all(void)
2301 {
2302 	struct dmar_drhd_unit *drhd;
2303 	struct intel_iommu *iommu;
2304 
2305 	for_each_active_iommu(iommu, drhd) {
2306 		iommu->flush.flush_context(iommu, 0, 0, 0,
2307 					   DMA_CCMD_GLOBAL_INVL);
2308 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2309 					 DMA_TLB_GLOBAL_FLUSH);
2310 	}
2311 }
2312 
2313 static int iommu_suspend(void)
2314 {
2315 	struct dmar_drhd_unit *drhd;
2316 	struct intel_iommu *iommu = NULL;
2317 	unsigned long flag;
2318 
2319 	iommu_flush_all();
2320 
2321 	for_each_active_iommu(iommu, drhd) {
2322 		iommu_disable_translation(iommu);
2323 
2324 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2325 
2326 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2327 			readl(iommu->reg + DMAR_FECTL_REG);
2328 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2329 			readl(iommu->reg + DMAR_FEDATA_REG);
2330 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2331 			readl(iommu->reg + DMAR_FEADDR_REG);
2332 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2333 			readl(iommu->reg + DMAR_FEUADDR_REG);
2334 
2335 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2336 	}
2337 	return 0;
2338 }
2339 
2340 static void iommu_resume(void)
2341 {
2342 	struct dmar_drhd_unit *drhd;
2343 	struct intel_iommu *iommu = NULL;
2344 	unsigned long flag;
2345 
2346 	if (init_iommu_hw()) {
2347 		if (force_on)
2348 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2349 		else
2350 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2351 		return;
2352 	}
2353 
2354 	for_each_active_iommu(iommu, drhd) {
2355 
2356 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2357 
2358 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2359 			iommu->reg + DMAR_FECTL_REG);
2360 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2361 			iommu->reg + DMAR_FEDATA_REG);
2362 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2363 			iommu->reg + DMAR_FEADDR_REG);
2364 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2365 			iommu->reg + DMAR_FEUADDR_REG);
2366 
2367 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2368 	}
2369 }
2370 
2371 static struct syscore_ops iommu_syscore_ops = {
2372 	.resume		= iommu_resume,
2373 	.suspend	= iommu_suspend,
2374 };
2375 
2376 static void __init init_iommu_pm_ops(void)
2377 {
2378 	register_syscore_ops(&iommu_syscore_ops);
2379 }
2380 
2381 #else
2382 static inline void init_iommu_pm_ops(void) {}
2383 #endif	/* CONFIG_SUSPEND */
2384 
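/*
 * An RMRR must describe a non-empty, page-aligned [base, end] range and
 * pass the arch-specific check; anything else is treated as firmware
 * breakage by the caller, which warns and taints the kernel.
 */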
2385 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2386 {
2387 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2388 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2389 	    rmrr->end_address <= rmrr->base_address ||
2390 	    arch_rmrr_sanity_check(rmrr))
2391 		return -EINVAL;
2392 
2393 	return 0;
2394 }
2395 
2396 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2397 {
2398 	struct acpi_dmar_reserved_memory *rmrr;
2399 	struct dmar_rmrr_unit *rmrru;
2400 
2401 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2402 	if (rmrr_sanity_check(rmrr)) {
2403 		pr_warn(FW_BUG
2404 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2405 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2406 			   rmrr->base_address, rmrr->end_address,
2407 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2408 			   dmi_get_system_info(DMI_BIOS_VERSION),
2409 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2410 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2411 	}
2412 
2413 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2414 	if (!rmrru)
2415 		goto out;
2416 
2417 	rmrru->hdr = header;
2418 
2419 	rmrru->base_address = rmrr->base_address;
2420 	rmrru->end_address = rmrr->end_address;
2421 
2422 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2423 				((void *)rmrr) + rmrr->header.length,
2424 				&rmrru->devices_cnt);
2425 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2426 		goto free_rmrru;
2427 
2428 	list_add(&rmrru->list, &dmar_rmrr_units);
2429 
2430 	return 0;
2431 free_rmrru:
2432 	kfree(rmrru);
2433 out:
2434 	return -ENOMEM;
2435 }
2436 
2437 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2438 {
2439 	struct dmar_atsr_unit *atsru;
2440 	struct acpi_dmar_atsr *tmp;
2441 
2442 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2443 				dmar_rcu_check()) {
2444 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2445 		if (atsr->segment != tmp->segment)
2446 			continue;
2447 		if (atsr->header.length != tmp->header.length)
2448 			continue;
2449 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2450 			return atsru;
2451 	}
2452 
2453 	return NULL;
2454 }
2455 
2456 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2457 {
2458 	struct acpi_dmar_atsr *atsr;
2459 	struct dmar_atsr_unit *atsru;
2460 
2461 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2462 		return 0;
2463 
2464 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2465 	atsru = dmar_find_atsr(atsr);
2466 	if (atsru)
2467 		return 0;
2468 
2469 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2470 	if (!atsru)
2471 		return -ENOMEM;
2472 
2473 	/*
2474 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2475 	 * copy the memory content because the memory buffer will be freed
2476 	 * on return.
2477 	 */
2478 	atsru->hdr = (void *)(atsru + 1);
2479 	memcpy(atsru->hdr, hdr, hdr->length);
2480 	atsru->include_all = atsr->flags & 0x1;
2481 	if (!atsru->include_all) {
2482 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2483 				(void *)atsr + atsr->header.length,
2484 				&atsru->devices_cnt);
2485 		if (atsru->devices_cnt && atsru->devices == NULL) {
2486 			kfree(atsru);
2487 			return -ENOMEM;
2488 		}
2489 	}
2490 
2491 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2492 
2493 	return 0;
2494 }
2495 
2496 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2497 {
2498 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2499 	kfree(atsru);
2500 }
2501 
2502 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2503 {
2504 	struct acpi_dmar_atsr *atsr;
2505 	struct dmar_atsr_unit *atsru;
2506 
2507 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2508 	atsru = dmar_find_atsr(atsr);
2509 	if (atsru) {
2510 		list_del_rcu(&atsru->list);
2511 		synchronize_rcu();
2512 		intel_iommu_free_atsr(atsru);
2513 	}
2514 
2515 	return 0;
2516 }
2517 
2518 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2519 {
2520 	int i;
2521 	struct device *dev;
2522 	struct acpi_dmar_atsr *atsr;
2523 	struct dmar_atsr_unit *atsru;
2524 
2525 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2526 	atsru = dmar_find_atsr(atsr);
2527 	if (!atsru)
2528 		return 0;
2529 
2530 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2531 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2532 					  i, dev)
2533 			return -EBUSY;
2534 	}
2535 
2536 	return 0;
2537 }
2538 
2539 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2540 {
2541 	struct dmar_satc_unit *satcu;
2542 	struct acpi_dmar_satc *tmp;
2543 
2544 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2545 				dmar_rcu_check()) {
2546 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2547 		if (satc->segment != tmp->segment)
2548 			continue;
2549 		if (satc->header.length != tmp->header.length)
2550 			continue;
2551 		if (memcmp(satc, tmp, satc->header.length) == 0)
2552 			return satcu;
2553 	}
2554 
2555 	return NULL;
2556 }
2557 
2558 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2559 {
2560 	struct acpi_dmar_satc *satc;
2561 	struct dmar_satc_unit *satcu;
2562 
2563 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2564 		return 0;
2565 
2566 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2567 	satcu = dmar_find_satc(satc);
2568 	if (satcu)
2569 		return 0;
2570 
2571 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2572 	if (!satcu)
2573 		return -ENOMEM;
2574 
2575 	satcu->hdr = (void *)(satcu + 1);
2576 	memcpy(satcu->hdr, hdr, hdr->length);
2577 	satcu->atc_required = satc->flags & 0x1;
2578 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2579 					      (void *)satc + satc->header.length,
2580 					      &satcu->devices_cnt);
2581 	if (satcu->devices_cnt && !satcu->devices) {
2582 		kfree(satcu);
2583 		return -ENOMEM;
2584 	}
2585 	list_add_rcu(&satcu->list, &dmar_satc_units);
2586 
2587 	return 0;
2588 }
2589 
2590 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2591 {
2592 	struct intel_iommu *iommu = dmaru->iommu;
2593 	int ret;
2594 
2595 	/*
2596 	 * Disable translation if already enabled prior to OS handover.
2597 	 */
2598 	if (iommu->gcmd & DMA_GCMD_TE)
2599 		iommu_disable_translation(iommu);
2600 
2601 	ret = iommu_alloc_root_entry(iommu);
2602 	if (ret)
2603 		goto out;
2604 
2605 	intel_svm_check(iommu);
2606 
2607 	if (dmaru->ignored) {
2608 		/*
2609 		 * we always have to disable PMRs or DMA may fail on this device
2610 		 */
2611 		if (force_on)
2612 			iommu_disable_protect_mem_regions(iommu);
2613 		return 0;
2614 	}
2615 
2616 	intel_iommu_init_qi(iommu);
2617 	iommu_flush_write_buffer(iommu);
2618 
2619 	if (ecap_prs(iommu->ecap)) {
2620 		ret = intel_iommu_enable_prq(iommu);
2621 		if (ret)
2622 			goto disable_iommu;
2623 	}
2624 
2625 	ret = dmar_set_interrupt(iommu);
2626 	if (ret)
2627 		goto disable_iommu;
2628 
2629 	iommu_set_root_entry(iommu);
2630 	iommu_enable_translation(iommu);
2631 
2632 	iommu_disable_protect_mem_regions(iommu);
2633 	return 0;
2634 
2635 disable_iommu:
2636 	disable_dmar_iommu(iommu);
2637 out:
2638 	free_dmar_iommu(iommu);
2639 	return ret;
2640 }
2641 
2642 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2643 {
2644 	int ret = 0;
2645 	struct intel_iommu *iommu = dmaru->iommu;
2646 
2647 	if (!intel_iommu_enabled)
2648 		return 0;
2649 	if (iommu == NULL)
2650 		return -EINVAL;
2651 
2652 	if (insert) {
2653 		ret = intel_iommu_add(dmaru);
2654 	} else {
2655 		disable_dmar_iommu(iommu);
2656 		free_dmar_iommu(iommu);
2657 	}
2658 
2659 	return ret;
2660 }
2661 
2662 static void intel_iommu_free_dmars(void)
2663 {
2664 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2665 	struct dmar_atsr_unit *atsru, *atsr_n;
2666 	struct dmar_satc_unit *satcu, *satc_n;
2667 
2668 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2669 		list_del(&rmrru->list);
2670 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2671 		kfree(rmrru);
2672 	}
2673 
2674 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2675 		list_del(&atsru->list);
2676 		intel_iommu_free_atsr(atsru);
2677 	}
2678 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2679 		list_del(&satcu->list);
2680 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2681 		kfree(satcu);
2682 	}
2683 }
2684 
2685 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2686 {
2687 	struct dmar_satc_unit *satcu;
2688 	struct acpi_dmar_satc *satc;
2689 	struct device *tmp;
2690 	int i;
2691 
2692 	rcu_read_lock();
2693 
2694 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2695 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2696 		if (satc->segment != pci_domain_nr(dev->bus))
2697 			continue;
2698 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2699 			if (to_pci_dev(tmp) == dev)
2700 				goto out;
2701 	}
2702 	satcu = NULL;
2703 out:
2704 	rcu_read_unlock();
2705 	return satcu;
2706 }
2707 
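/*
 * Decide whether ATS may be enabled for @dev: a SATC entry gives a
 * definitive answer, otherwise walk up to the PCIe root port and look it
 * up in the ATSR list for the device's PCI segment.
 */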
2708 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2709 {
2710 	struct pci_dev *bridge = NULL;
2711 	struct dmar_atsr_unit *atsru;
2712 	struct dmar_satc_unit *satcu;
2713 	struct acpi_dmar_atsr *atsr;
2714 	bool supported = true;
2715 	struct pci_bus *bus;
2716 	struct device *tmp;
2717 	int i;
2718 
2719 	dev = pci_physfn(dev);
2720 	satcu = dmar_find_matched_satc_unit(dev);
2721 	if (satcu)
2722 		/*
2723 		 * This device supports ATS because it is listed in the
2724 		 * SATC table. When the IOMMU is in legacy mode, hardware
2725 		 * enables ATS automatically for devices that require it,
2726 		 * so the OS should not enable ATS on this device again;
2727 		 * doing so would cause duplicated TLB invalidations.
2728 		 */
2729 		return !(satcu->atc_required && !sm_supported(iommu));
2730 
2731 	for (bus = dev->bus; bus; bus = bus->parent) {
2732 		bridge = bus->self;
2733 		/* If it's an integrated device, allow ATS */
2734 		if (!bridge)
2735 			return true;
2736 		/* Connected via non-PCIe: no ATS */
2737 		if (!pci_is_pcie(bridge) ||
2738 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2739 			return false;
2740 		/* If we found the root port, look it up in the ATSR */
2741 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2742 			break;
2743 	}
2744 
2745 	rcu_read_lock();
2746 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2747 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2748 		if (atsr->segment != pci_domain_nr(dev->bus))
2749 			continue;
2750 
2751 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2752 			if (tmp == &bridge->dev)
2753 				goto out;
2754 
2755 		if (atsru->include_all)
2756 			goto out;
2757 	}
2758 	supported = false;
2759 out:
2760 	rcu_read_unlock();
2761 
2762 	return supported;
2763 }
2764 
2765 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2766 {
2767 	int ret;
2768 	struct dmar_rmrr_unit *rmrru;
2769 	struct dmar_atsr_unit *atsru;
2770 	struct dmar_satc_unit *satcu;
2771 	struct acpi_dmar_atsr *atsr;
2772 	struct acpi_dmar_reserved_memory *rmrr;
2773 	struct acpi_dmar_satc *satc;
2774 
2775 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2776 		return 0;
2777 
2778 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2779 		rmrr = container_of(rmrru->hdr,
2780 				    struct acpi_dmar_reserved_memory, header);
2781 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2782 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2783 				((void *)rmrr) + rmrr->header.length,
2784 				rmrr->segment, rmrru->devices,
2785 				rmrru->devices_cnt);
2786 			if (ret < 0)
2787 				return ret;
2788 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2789 			dmar_remove_dev_scope(info, rmrr->segment,
2790 				rmrru->devices, rmrru->devices_cnt);
2791 		}
2792 	}
2793 
2794 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2795 		if (atsru->include_all)
2796 			continue;
2797 
2798 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2799 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2800 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2801 					(void *)atsr + atsr->header.length,
2802 					atsr->segment, atsru->devices,
2803 					atsru->devices_cnt);
2804 			if (ret > 0)
2805 				break;
2806 			else if (ret < 0)
2807 				return ret;
2808 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2809 			if (dmar_remove_dev_scope(info, atsr->segment,
2810 					atsru->devices, atsru->devices_cnt))
2811 				break;
2812 		}
2813 	}
2814 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2815 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2816 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2817 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2818 					(void *)satc + satc->header.length,
2819 					satc->segment, satcu->devices,
2820 					satcu->devices_cnt);
2821 			if (ret > 0)
2822 				break;
2823 			else if (ret < 0)
2824 				return ret;
2825 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2826 			if (dmar_remove_dev_scope(info, satc->segment,
2827 					satcu->devices, satcu->devices_cnt))
2828 				break;
2829 		}
2830 	}
2831 
2832 	return 0;
2833 }
2834 
2835 static void intel_disable_iommus(void)
2836 {
2837 	struct intel_iommu *iommu = NULL;
2838 	struct dmar_drhd_unit *drhd;
2839 
2840 	for_each_iommu(iommu, drhd)
2841 		iommu_disable_translation(iommu);
2842 }
2843 
2844 void intel_iommu_shutdown(void)
2845 {
2846 	struct dmar_drhd_unit *drhd;
2847 	struct intel_iommu *iommu = NULL;
2848 
2849 	if (no_iommu || dmar_disabled)
2850 		return;
2851 
2852 	/*
2853 	 * All other CPUs were brought down and hotplug interrupts were
2854 	 * disabled, so no locking or RCU checking is needed anymore.
2855 	 */
2856 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2857 		iommu = drhd->iommu;
2858 
2859 		/* Disable PMRs explicitly here. */
2860 		iommu_disable_protect_mem_regions(iommu);
2861 
2862 		/* Make sure the IOMMUs are switched off */
2863 		iommu_disable_translation(iommu);
2864 	}
2865 }
2866 
2867 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2868 {
2869 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2870 
2871 	return container_of(iommu_dev, struct intel_iommu, iommu);
2872 }
2873 
2874 static ssize_t version_show(struct device *dev,
2875 			    struct device_attribute *attr, char *buf)
2876 {
2877 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2878 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2879 	return sysfs_emit(buf, "%d:%d\n",
2880 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2881 }
2882 static DEVICE_ATTR_RO(version);
2883 
2884 static ssize_t address_show(struct device *dev,
2885 			    struct device_attribute *attr, char *buf)
2886 {
2887 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2888 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2889 }
2890 static DEVICE_ATTR_RO(address);
2891 
2892 static ssize_t cap_show(struct device *dev,
2893 			struct device_attribute *attr, char *buf)
2894 {
2895 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2896 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2897 }
2898 static DEVICE_ATTR_RO(cap);
2899 
2900 static ssize_t ecap_show(struct device *dev,
2901 			 struct device_attribute *attr, char *buf)
2902 {
2903 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2904 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2905 }
2906 static DEVICE_ATTR_RO(ecap);
2907 
2908 static ssize_t domains_supported_show(struct device *dev,
2909 				      struct device_attribute *attr, char *buf)
2910 {
2911 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2912 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2913 }
2914 static DEVICE_ATTR_RO(domains_supported);
2915 
2916 static ssize_t domains_used_show(struct device *dev,
2917 				 struct device_attribute *attr, char *buf)
2918 {
2919 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2920 	unsigned int count = 0;
2921 	int id;
2922 
2923 	for (id = 0; id < cap_ndoms(iommu->cap); id++)
2924 		if (ida_exists(&iommu->domain_ida, id))
2925 			count++;
2926 
2927 	return sysfs_emit(buf, "%d\n", count);
2928 }
2929 static DEVICE_ATTR_RO(domains_used);
2930 
2931 static struct attribute *intel_iommu_attrs[] = {
2932 	&dev_attr_version.attr,
2933 	&dev_attr_address.attr,
2934 	&dev_attr_cap.attr,
2935 	&dev_attr_ecap.attr,
2936 	&dev_attr_domains_supported.attr,
2937 	&dev_attr_domains_used.attr,
2938 	NULL,
2939 };
2940 
2941 static struct attribute_group intel_iommu_group = {
2942 	.name = "intel-iommu",
2943 	.attrs = intel_iommu_attrs,
2944 };
2945 
2946 const struct attribute_group *intel_iommu_groups[] = {
2947 	&intel_iommu_group,
2948 	NULL,
2949 };
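/*
 * These attributes are registered via iommu_device_sysfs_add() later in
 * intel_iommu_init() and typically show up under the IOMMU's sysfs
 * directory, e.g. (paths assumed from the "%s"/iommu->name registration,
 * not guaranteed on every setup):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/ecap
 */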
2950 
2951 static bool has_external_pci(void)
2952 {
2953 	struct pci_dev *pdev = NULL;
2954 
2955 	for_each_pci_dev(pdev)
2956 		if (pdev->external_facing) {
2957 			pci_dev_put(pdev);
2958 			return true;
2959 		}
2960 
2961 	return false;
2962 }
2963 
2964 static int __init platform_optin_force_iommu(void)
2965 {
2966 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2967 		return 0;
2968 
2969 	if (no_iommu || dmar_disabled)
2970 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2971 
2972 	/*
2973 	 * If Intel-IOMMU is disabled by default, we will apply identity
2974 	 * map for all devices except those marked as being untrusted.
2975 	 */
2976 	if (dmar_disabled)
2977 		iommu_set_default_passthrough(false);
2978 
2979 	dmar_disabled = 0;
2980 	no_iommu = 0;
2981 
2982 	return 1;
2983 }
2984 
2985 static int __init probe_acpi_namespace_devices(void)
2986 {
2987 	struct dmar_drhd_unit *drhd;
2988 	/* To avoid a -Wunused-but-set-variable warning. */
2989 	struct intel_iommu *iommu __maybe_unused;
2990 	struct device *dev;
2991 	int i, ret = 0;
2992 
2993 	for_each_active_iommu(iommu, drhd) {
2994 		for_each_active_dev_scope(drhd->devices,
2995 					  drhd->devices_cnt, i, dev) {
2996 			struct acpi_device_physical_node *pn;
2997 			struct acpi_device *adev;
2998 
2999 			if (dev->bus != &acpi_bus_type)
3000 				continue;
3001 
3002 			up_read(&dmar_global_lock);
3003 			adev = to_acpi_device(dev);
3004 			mutex_lock(&adev->physical_node_lock);
3005 			list_for_each_entry(pn,
3006 					    &adev->physical_node_list, node) {
3007 				ret = iommu_probe_device(pn->dev);
3008 				if (ret)
3009 					break;
3010 			}
3011 			mutex_unlock(&adev->physical_node_lock);
3012 			down_read(&dmar_global_lock);
3013 
3014 			if (ret)
3015 				return ret;
3016 		}
3017 	}
3018 
3019 	return 0;
3020 }
3021 
3022 static __init int tboot_force_iommu(void)
3023 {
3024 	if (!tboot_enabled())
3025 		return 0;
3026 
3027 	if (no_iommu || dmar_disabled)
3028 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3029 
3030 	dmar_disabled = 0;
3031 	no_iommu = 0;
3032 
3033 	return 1;
3034 }
3035 
3036 int __init intel_iommu_init(void)
3037 {
3038 	int ret = -ENODEV;
3039 	struct dmar_drhd_unit *drhd;
3040 	struct intel_iommu *iommu;
3041 
3042 	/*
3043 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3044 	 * opt in, so enforce that.
3045 	 */
3046 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3047 		    platform_optin_force_iommu();
3048 
3049 	down_write(&dmar_global_lock);
3050 	if (dmar_table_init()) {
3051 		if (force_on)
3052 			panic("tboot: Failed to initialize DMAR table\n");
3053 		goto out_free_dmar;
3054 	}
3055 
3056 	if (dmar_dev_scope_init() < 0) {
3057 		if (force_on)
3058 			panic("tboot: Failed to initialize DMAR device scope\n");
3059 		goto out_free_dmar;
3060 	}
3061 
3062 	up_write(&dmar_global_lock);
3063 
3064 	/*
3065 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3066 	 * complain later when we register it under the lock.
3067 	 */
3068 	dmar_register_bus_notifier();
3069 
3070 	down_write(&dmar_global_lock);
3071 
3072 	if (!no_iommu)
3073 		intel_iommu_debugfs_init();
3074 
3075 	if (no_iommu || dmar_disabled) {
3076 		/*
3077 		 * We exit the function here to ensure the IOMMU's remapping and
3078 		 * mempool aren't set up, which means that the IOMMU's PMRs
3079 		 * won't be disabled via the call to init_dmars(). So disable
3080 		 * it explicitly here. The PMRs were setup by tboot prior to
3081 		 * calling SENTER, but the kernel is expected to reset/tear
3082 		 * down the PMRs.
3083 		 */
3084 		if (intel_iommu_tboot_noforce) {
3085 			for_each_iommu(iommu, drhd)
3086 				iommu_disable_protect_mem_regions(iommu);
3087 		}
3088 
3089 		/*
3090 		 * Make sure the IOMMUs are switched off, even when we
3091 		 * boot into a kexec kernel and the previous kernel left
3092 		 * them enabled.
3093 		 */
3094 		intel_disable_iommus();
3095 		goto out_free_dmar;
3096 	}
3097 
3098 	if (list_empty(&dmar_rmrr_units))
3099 		pr_info("No RMRR found\n");
3100 
3101 	if (list_empty(&dmar_atsr_units))
3102 		pr_info("No ATSR found\n");
3103 
3104 	if (list_empty(&dmar_satc_units))
3105 		pr_info("No SATC found\n");
3106 
3107 	init_no_remapping_devices();
3108 
3109 	ret = init_dmars();
3110 	if (ret) {
3111 		if (force_on)
3112 			panic("tboot: Failed to initialize DMARs\n");
3113 		pr_err("Initialization failed\n");
3114 		goto out_free_dmar;
3115 	}
3116 	up_write(&dmar_global_lock);
3117 
3118 	init_iommu_pm_ops();
3119 
3120 	down_read(&dmar_global_lock);
3121 	for_each_active_iommu(iommu, drhd) {
3122 		/*
3123 		 * The flush queue implementation does not perform
3124 		 * page-selective invalidations that are required for efficient
3125 		 * TLB flushes in virtual environments.  The benefit of batching
3126 		 * is likely to be much lower than the overhead of synchronizing
3127 		 * the virtual and physical IOMMU page-tables.
3128 		 */
3129 		if (cap_caching_mode(iommu->cap) &&
3130 		    !first_level_by_default(iommu)) {
3131 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3132 			iommu_set_dma_strict();
3133 		}
3134 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3135 				       intel_iommu_groups,
3136 				       "%s", iommu->name);
3137 		/*
3138 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3139 		 * Release the dmar_global_lock before entering the device probe path
3140 		 * to avoid unnecessary lock order splat.
3141 		 */
3142 		up_read(&dmar_global_lock);
3143 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3144 		down_read(&dmar_global_lock);
3145 
3146 		iommu_pmu_register(iommu);
3147 	}
3148 
3149 	if (probe_acpi_namespace_devices())
3150 		pr_warn("ACPI name space devices didn't probe correctly\n");
3151 
3152 	/* Finally, we enable the DMA remapping hardware. */
3153 	for_each_iommu(iommu, drhd) {
3154 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3155 			iommu_enable_translation(iommu);
3156 
3157 		iommu_disable_protect_mem_regions(iommu);
3158 	}
3159 	up_read(&dmar_global_lock);
3160 
3161 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3162 
3163 	intel_iommu_enabled = 1;
3164 
3165 	return 0;
3166 
3167 out_free_dmar:
3168 	intel_iommu_free_dmars();
3169 	up_write(&dmar_global_lock);
3170 	return ret;
3171 }
3172 
3173 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3174 {
3175 	struct device_domain_info *info = opaque;
3176 
3177 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3178 	return 0;
3179 }
3180 
3181 /*
3182  * NB - intel-iommu lacks any sort of reference counting for the users of
3183  * dependent devices.  If multiple endpoints have intersecting dependent
3184  * devices, unbinding the driver from any one of them will possibly leave
3185  * the others unable to operate.
3186  */
3187 static void domain_context_clear(struct device_domain_info *info)
3188 {
3189 	if (!dev_is_pci(info->dev)) {
3190 		domain_context_clear_one(info, info->bus, info->devfn);
3191 		return;
3192 	}
3193 
3194 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3195 			       &domain_context_clear_one_cb, info);
3196 	iommu_disable_pci_ats(info);
3197 }
3198 
3199 /*
3200  * Clear the page table pointer in context or pasid table entries so that
3201  * all DMA requests without PASID from the device are blocked. If the page
3202  * table has been set, clean up the data structures.
3203  */
3204 void device_block_translation(struct device *dev)
3205 {
3206 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3207 	struct intel_iommu *iommu = info->iommu;
3208 	unsigned long flags;
3209 
3210 	/* Device already in DMA blocking state. Nothing to do. */
3211 	if (!info->domain_attached)
3212 		return;
3213 
3214 	if (info->domain)
3215 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3216 
3217 	if (!dev_is_real_dma_subdevice(dev)) {
3218 		if (sm_supported(iommu))
3219 			intel_pasid_tear_down_entry(iommu, dev,
3220 						    IOMMU_NO_PASID, false);
3221 		else
3222 			domain_context_clear(info);
3223 	}
3224 
3225 	/* Device now in DMA blocking state. */
3226 	info->domain_attached = false;
3227 
3228 	if (!info->domain)
3229 		return;
3230 
3231 	spin_lock_irqsave(&info->domain->lock, flags);
3232 	list_del(&info->link);
3233 	spin_unlock_irqrestore(&info->domain->lock, flags);
3234 
3235 	domain_detach_iommu(info->domain, iommu);
3236 	info->domain = NULL;
3237 }
3238 
3239 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3240 				      struct device *dev)
3241 {
3242 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3243 
3244 	iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
3245 	device_block_translation(dev);
3246 	return 0;
3247 }
3248 
3249 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3250 					 struct device *dev, ioasid_t pasid,
3251 					 struct iommu_domain *old);
3252 
3253 static struct iommu_domain blocking_domain = {
3254 	.type = IOMMU_DOMAIN_BLOCKED,
3255 	.ops = &(const struct iommu_domain_ops) {
3256 		.attach_dev	= blocking_domain_attach_dev,
3257 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3258 	}
3259 };
3260 
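/*
 * Number of supported superpage levels for a new domain: 0 when
 * superpages are disabled, 1 (2MiB) or 2 (2MiB + 1GiB) for first-stage
 * tables, or whatever cap_super_page_val() advertises for second-stage.
 */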
3261 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3262 {
3263 	if (!intel_iommu_superpage)
3264 		return 0;
3265 
3266 	if (first_stage)
3267 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3268 
3269 	return fls(cap_super_page_val(iommu->cap));
3270 }
3271 
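/*
 * Allocate a paging domain sized for @dev's IOMMU: address width,
 * superpage levels, paging-structure coherency, the IOVA aperture (halved
 * for first-stage translation so addresses stay canonical) and the
 * top-level page directory.
 */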
3272 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3273 {
3274 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3275 	struct intel_iommu *iommu = info->iommu;
3276 	struct dmar_domain *domain;
3277 	int addr_width;
3278 
3279 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3280 	if (!domain)
3281 		return ERR_PTR(-ENOMEM);
3282 
3283 	INIT_LIST_HEAD(&domain->devices);
3284 	INIT_LIST_HEAD(&domain->dev_pasids);
3285 	INIT_LIST_HEAD(&domain->cache_tags);
3286 	spin_lock_init(&domain->lock);
3287 	spin_lock_init(&domain->cache_lock);
3288 	xa_init(&domain->iommu_array);
3289 
3290 	domain->nid = dev_to_node(dev);
3291 	domain->use_first_level = first_stage;
3292 
3293 	/* calculate the address width */
3294 	addr_width = agaw_to_width(iommu->agaw);
3295 	if (addr_width > cap_mgaw(iommu->cap))
3296 		addr_width = cap_mgaw(iommu->cap);
3297 	domain->gaw = addr_width;
3298 	domain->agaw = iommu->agaw;
3299 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3300 
3301 	/* iommu memory access coherency */
3302 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3303 
3304 	/* pagesize bitmap */
3305 	domain->domain.pgsize_bitmap = SZ_4K;
3306 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3307 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3308 
3309 	/*
3310 	 * IOVA aperture: First-level translation restricts the input-address
3311 	 * to a canonical address (i.e., address bits 63:N have the same value
3312 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3313 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3314 	 */
3315 	domain->domain.geometry.force_aperture = true;
3316 	domain->domain.geometry.aperture_start = 0;
3317 	if (first_stage)
3318 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3319 	else
3320 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3321 
3322 	/* always allocate the top pgd */
3323 	domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
3324 	if (!domain->pgd) {
3325 		kfree(domain);
3326 		return ERR_PTR(-ENOMEM);
3327 	}
3328 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3329 
3330 	return domain;
3331 }
3332 
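/*
 * iommu_ops->domain_alloc_paging_flags() entry point. Only
 * IOMMU_HWPT_ALLOC_NEST_PARENT, IOMMU_HWPT_ALLOC_DIRTY_TRACKING and
 * IOMMU_HWPT_ALLOC_PASID are accepted; nesting parents and dirty tracking
 * force a second-stage page table, everything else follows
 * first_level_by_default().
 */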
3333 static struct iommu_domain *
3334 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3335 				      const struct iommu_user_data *user_data)
3336 {
3337 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3338 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3339 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3340 	struct intel_iommu *iommu = info->iommu;
3341 	struct dmar_domain *dmar_domain;
3342 	struct iommu_domain *domain;
3343 	bool first_stage;
3344 
3345 	if (flags &
3346 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3347 	       IOMMU_HWPT_ALLOC_PASID)))
3348 		return ERR_PTR(-EOPNOTSUPP);
3349 	if (nested_parent && !nested_supported(iommu))
3350 		return ERR_PTR(-EOPNOTSUPP);
3351 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3352 		return ERR_PTR(-EOPNOTSUPP);
3353 
3354 	/*
3355 	 * Always allocate the guest compatible page table unless
3356 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3357 	 * is specified.
3358 	 */
3359 	if (nested_parent || dirty_tracking) {
3360 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3361 			return ERR_PTR(-EOPNOTSUPP);
3362 		first_stage = false;
3363 	} else {
3364 		first_stage = first_level_by_default(iommu);
3365 	}
3366 
3367 	dmar_domain = paging_domain_alloc(dev, first_stage);
3368 	if (IS_ERR(dmar_domain))
3369 		return ERR_CAST(dmar_domain);
3370 	domain = &dmar_domain->domain;
3371 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3372 	domain->owner = &intel_iommu_ops;
3373 	domain->ops = intel_iommu_ops.default_domain_ops;
3374 
3375 	if (nested_parent) {
3376 		dmar_domain->nested_parent = true;
3377 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3378 		spin_lock_init(&dmar_domain->s1_lock);
3379 	}
3380 
3381 	if (dirty_tracking) {
3382 		if (dmar_domain->use_first_level) {
3383 			iommu_domain_free(domain);
3384 			return ERR_PTR(-EOPNOTSUPP);
3385 		}
3386 		domain->dirty_ops = &intel_dirty_ops;
3387 	}
3388 
3389 	return domain;
3390 }
3391 
3392 static void intel_iommu_domain_free(struct iommu_domain *domain)
3393 {
3394 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3395 
3396 	WARN_ON(dmar_domain->nested_parent &&
3397 		!list_empty(&dmar_domain->s1_domains));
3398 	domain_exit(dmar_domain);
3399 }
3400 
3401 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3402 {
3403 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3404 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3405 	struct intel_iommu *iommu = info->iommu;
3406 	int addr_width;
3407 
3408 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3409 		return -EPERM;
3410 
3411 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3412 		return -EINVAL;
3413 
3414 	if (domain->dirty_ops && !ssads_supported(iommu))
3415 		return -EINVAL;
3416 
3417 	if (dmar_domain->iommu_coherency !=
3418 			iommu_paging_structure_coherency(iommu))
3419 		return -EINVAL;
3420 
3421 	if (dmar_domain->iommu_superpage !=
3422 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3423 		return -EINVAL;
3424 
3425 	if (dmar_domain->use_first_level &&
3426 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3427 		return -EINVAL;
3428 
3429 	/* check if this iommu agaw is sufficient for max mapped address */
3430 	addr_width = agaw_to_width(iommu->agaw);
3431 	if (addr_width > cap_mgaw(iommu->cap))
3432 		addr_width = cap_mgaw(iommu->cap);
3433 
3434 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3435 		return -EINVAL;
3436 
3437 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3438 	    context_copied(iommu, info->bus, info->devfn))
3439 		return intel_pasid_setup_sm_context(dev);
3440 
3441 	return 0;
3442 }
3443 
3444 static int intel_iommu_attach_device(struct iommu_domain *domain,
3445 				     struct device *dev)
3446 {
3447 	int ret;
3448 
3449 	device_block_translation(dev);
3450 
3451 	ret = paging_domain_compatible(domain, dev);
3452 	if (ret)
3453 		return ret;
3454 
3455 	ret = iopf_for_domain_set(domain, dev);
3456 	if (ret)
3457 		return ret;
3458 
3459 	ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
3460 	if (ret)
3461 		iopf_for_domain_remove(domain, dev);
3462 
3463 	return ret;
3464 }
3465 
3466 static int intel_iommu_map(struct iommu_domain *domain,
3467 			   unsigned long iova, phys_addr_t hpa,
3468 			   size_t size, int iommu_prot, gfp_t gfp)
3469 {
3470 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3471 	u64 max_addr;
3472 	int prot = 0;
3473 
3474 	if (iommu_prot & IOMMU_READ)
3475 		prot |= DMA_PTE_READ;
3476 	if (iommu_prot & IOMMU_WRITE)
3477 		prot |= DMA_PTE_WRITE;
3478 	if (dmar_domain->set_pte_snp)
3479 		prot |= DMA_PTE_SNP;
3480 
3481 	max_addr = iova + size;
3482 	if (dmar_domain->max_addr < max_addr) {
3483 		u64 end;
3484 
3485 		/* check if minimum agaw is sufficient for mapped address */
3486 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3487 		if (end < max_addr) {
3488 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
3489 			       __func__, dmar_domain->gaw,
3490 			       max_addr);
3491 			return -EFAULT;
3492 		}
3493 		dmar_domain->max_addr = max_addr;
3494 	}
3495 	/* Round up size to next multiple of PAGE_SIZE, if it and
3496 	   the low bits of hpa would take us onto the next page */
3497 	size = aligned_nrpages(hpa, size);
3498 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3499 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3500 }
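
/*
 * Worked example of the round-up noted above (illustrative only, assuming
 * 4KiB VT-d pages): hpa == 0x10000800 with size == 0x1000 straddles a page
 * boundary, so aligned_nrpages() yields two pages and __domain_mapping()
 * maps the host range [0x10000000, 0x10002000) at the containing IOVA pages.
 */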
3501 
3502 static int intel_iommu_map_pages(struct iommu_domain *domain,
3503 				 unsigned long iova, phys_addr_t paddr,
3504 				 size_t pgsize, size_t pgcount,
3505 				 int prot, gfp_t gfp, size_t *mapped)
3506 {
3507 	unsigned long pgshift = __ffs(pgsize);
3508 	size_t size = pgcount << pgshift;
3509 	int ret;
3510 
3511 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3512 		return -EINVAL;
3513 
3514 	if (!IS_ALIGNED(iova | paddr, pgsize))
3515 		return -EINVAL;
3516 
3517 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3518 	if (!ret && mapped)
3519 		*mapped = size;
3520 
3521 	return ret;
3522 }
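
/*
 * Usage sketch (hypothetical values, illustrative only): to map 16MiB with
 * 2MiB pages a caller passes pgsize == SZ_2M and pgcount == 8, with iova and
 * paddr both 2MiB-aligned; pgshift is then 21, size is 8 << 21 == 16MiB, and
 * *mapped reports the full amount on success.
 */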
3523 
3524 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3525 				unsigned long iova, size_t size,
3526 				struct iommu_iotlb_gather *gather)
3527 {
3528 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3529 	unsigned long start_pfn, last_pfn;
3530 	int level = 0;
3531 
3532 	/* Cope with horrid API which requires us to unmap more than the
3533 	   size argument if it happens to be a large-page mapping. */
3534 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3535 				     &level, GFP_ATOMIC)))
3536 		return 0;
3537 
3538 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3539 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3540 
3541 	start_pfn = iova >> VTD_PAGE_SHIFT;
3542 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3543 
3544 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3545 
3546 	if (dmar_domain->max_addr == iova + size)
3547 		dmar_domain->max_addr = iova;
3548 
3549 	/*
3550 	 * We do not use page-selective IOTLB invalidation in the flush queue,
3551 	 * so there is no need to track the page and sync the IOTLB.
3552 	 */
3553 	if (!iommu_iotlb_gather_queued(gather))
3554 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3555 
3556 	return size;
3557 }
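
/*
 * Illustrative note (assuming the usual 9-bit stride per page-table level):
 * if the PTE covering @iova is a 2MiB superpage (level 2), a request to
 * unmap only 4KiB is widened to the full 2MiB region before domain_unmap()
 * runs, and the widened size is returned so the caller can account for what
 * was actually torn down.
 */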
3558 
3559 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3560 				      unsigned long iova,
3561 				      size_t pgsize, size_t pgcount,
3562 				      struct iommu_iotlb_gather *gather)
3563 {
3564 	unsigned long pgshift = __ffs(pgsize);
3565 	size_t size = pgcount << pgshift;
3566 
3567 	return intel_iommu_unmap(domain, iova, size, gather);
3568 }
3569 
3570 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3571 				 struct iommu_iotlb_gather *gather)
3572 {
3573 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3574 			      gather->end,
3575 			      iommu_pages_list_empty(&gather->freelist));
3576 	iommu_put_pages_list(&gather->freelist);
3577 }
3578 
3579 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3580 					    dma_addr_t iova)
3581 {
3582 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3583 	struct dma_pte *pte;
3584 	int level = 0;
3585 	u64 phys = 0;
3586 
3587 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3588 			     GFP_ATOMIC);
3589 	if (pte && dma_pte_present(pte))
3590 		phys = dma_pte_addr(pte) +
3591 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3592 						VTD_PAGE_SHIFT) - 1));
3593 
3594 	return phys;
3595 }
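
/*
 * Worked example (assuming the usual 9-bit stride per page-table level):
 * for a 4KiB leaf (level 1) the low 12 bits of @iova are kept as the page
 * offset, while for a 2MiB superpage (level 2) the mask widens to the low
 * 21 bits, i.e. phys = dma_pte_addr(pte) + (iova & 0x1fffff).
 */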
3596 
3597 static bool domain_support_force_snooping(struct dmar_domain *domain)
3598 {
3599 	struct device_domain_info *info;
3600 	bool support = true;
3601 
3602 	assert_spin_locked(&domain->lock);
3603 	list_for_each_entry(info, &domain->devices, link) {
3604 		if (!ecap_sc_support(info->iommu->ecap)) {
3605 			support = false;
3606 			break;
3607 		}
3608 	}
3609 
3610 	return support;
3611 }
3612 
3613 static void domain_set_force_snooping(struct dmar_domain *domain)
3614 {
3615 	struct device_domain_info *info;
3616 
3617 	assert_spin_locked(&domain->lock);
3618 	/*
3619 	 * The second-level page table supports per-PTE snoop control. The
3620 	 * iommu_map() interface will handle this by setting the SNP bit.
3621 	 */
3622 	if (!domain->use_first_level) {
3623 		domain->set_pte_snp = true;
3624 		return;
3625 	}
3626 
3627 	list_for_each_entry(info, &domain->devices, link)
3628 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3629 						     IOMMU_NO_PASID);
3630 }
3631 
3632 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3633 {
3634 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3635 	unsigned long flags;
3636 
3637 	if (dmar_domain->force_snooping)
3638 		return true;
3639 
3640 	spin_lock_irqsave(&dmar_domain->lock, flags);
3641 	if (!domain_support_force_snooping(dmar_domain) ||
3642 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3643 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3644 		return false;
3645 	}
3646 
3647 	domain_set_force_snooping(dmar_domain);
3648 	dmar_domain->force_snooping = true;
3649 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3650 
3651 	return true;
3652 }
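
/*
 * Ordering note (illustrative): a second-level page table only gains the
 * SNP bit on PTEs written after set_pte_snp is set, so callers are expected
 * to enforce cache coherency before establishing mappings; once a
 * second-level domain has mappings, the request above fails.
 */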
3653 
3654 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3655 {
3656 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3657 
3658 	switch (cap) {
3659 	case IOMMU_CAP_CACHE_COHERENCY:
3660 	case IOMMU_CAP_DEFERRED_FLUSH:
3661 		return true;
3662 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3663 		return dmar_platform_optin();
3664 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3665 		return ecap_sc_support(info->iommu->ecap);
3666 	case IOMMU_CAP_DIRTY_TRACKING:
3667 		return ssads_supported(info->iommu);
3668 	default:
3669 		return false;
3670 	}
3671 }
3672 
3673 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3674 {
3675 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3676 	struct device_domain_info *info;
3677 	struct intel_iommu *iommu;
3678 	u8 bus, devfn;
3679 	int ret;
3680 
3681 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3682 	if (!iommu || !iommu->iommu.ops)
3683 		return ERR_PTR(-ENODEV);
3684 
3685 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3686 	if (!info)
3687 		return ERR_PTR(-ENOMEM);
3688 
3689 	if (dev_is_real_dma_subdevice(dev)) {
3690 		info->bus = pdev->bus->number;
3691 		info->devfn = pdev->devfn;
3692 		info->segment = pci_domain_nr(pdev->bus);
3693 	} else {
3694 		info->bus = bus;
3695 		info->devfn = devfn;
3696 		info->segment = iommu->segment;
3697 	}
3698 
3699 	info->dev = dev;
3700 	info->iommu = iommu;
3701 	if (dev_is_pci(dev)) {
3702 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3703 		    pci_ats_supported(pdev) &&
3704 		    dmar_ats_supported(pdev, iommu)) {
3705 			info->ats_supported = 1;
3706 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3707 
3708 			/*
3709 			 * For an IOMMU that supports device IOTLB throttling
3710 			 * (DIT), we assign the PFSID to the invalidation
3711 			 * descriptor of a VF so that the IOMMU hardware can
3712 			 * gauge queue depth at the PF level. If DIT is not
3713 			 * set, the PFSID is treated as reserved and must be 0.
3714 			 */
3715 			if (ecap_dit(iommu->ecap))
3716 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3717 			info->ats_qdep = pci_ats_queue_depth(pdev);
3718 		}
3719 		if (sm_supported(iommu)) {
3720 			if (pasid_supported(iommu)) {
3721 				int features = pci_pasid_features(pdev);
3722 
3723 				if (features >= 0)
3724 					info->pasid_supported = features | 1;
3725 			}
3726 
3727 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3728 			    pci_pri_supported(pdev))
3729 				info->pri_supported = 1;
3730 		}
3731 	}
3732 
3733 	dev_iommu_priv_set(dev, info);
3734 	if (pdev && pci_ats_supported(pdev)) {
3735 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3736 		ret = device_rbtree_insert(iommu, info);
3737 		if (ret)
3738 			goto free;
3739 	}
3740 
3741 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3742 		ret = intel_pasid_alloc_table(dev);
3743 		if (ret) {
3744 			dev_err(dev, "PASID table allocation failed\n");
3745 			goto clear_rbtree;
3746 		}
3747 
3748 		if (!context_copied(iommu, info->bus, info->devfn)) {
3749 			ret = intel_pasid_setup_sm_context(dev);
3750 			if (ret)
3751 				goto free_table;
3752 		}
3753 	}
3754 
3755 	intel_iommu_debugfs_create_dev(info);
3756 
3757 	return &iommu->iommu;
3758 free_table:
3759 	intel_pasid_free_table(dev);
3760 clear_rbtree:
3761 	device_rbtree_remove(info);
3762 free:
3763 	kfree(info);
3764 
3765 	return ERR_PTR(ret);
3766 }
3767 
3768 static void intel_iommu_probe_finalize(struct device *dev)
3769 {
3770 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3771 	struct intel_iommu *iommu = info->iommu;
3772 
3773 	/*
3774 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3775 	 * device is undefined if you enable PASID support after ATS support.
3776 	 * So always enable PASID support on devices which have it, even if
3777 	 * we can't yet know if we're ever going to use it.
3778 	 */
3779 	if (info->pasid_supported &&
3780 	    !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3781 		info->pasid_enabled = 1;
3782 
3783 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev))
3784 		iommu_enable_pci_ats(info);
3785 	iommu_enable_pci_pri(info);
3786 }
3787 
3788 static void intel_iommu_release_device(struct device *dev)
3789 {
3790 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3791 	struct intel_iommu *iommu = info->iommu;
3792 
3793 	iommu_disable_pci_pri(info);
3794 	iommu_disable_pci_ats(info);
3795 
3796 	if (info->pasid_enabled) {
3797 		pci_disable_pasid(to_pci_dev(dev));
3798 		info->pasid_enabled = 0;
3799 	}
3800 
3801 	mutex_lock(&iommu->iopf_lock);
3802 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3803 		device_rbtree_remove(info);
3804 	mutex_unlock(&iommu->iopf_lock);
3805 
3806 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3807 	    !context_copied(iommu, info->bus, info->devfn))
3808 		intel_pasid_teardown_sm_context(dev);
3809 
3810 	intel_pasid_free_table(dev);
3811 	intel_iommu_debugfs_remove_dev(info);
3812 	kfree(info);
3813 }
3814 
3815 static void intel_iommu_get_resv_regions(struct device *device,
3816 					 struct list_head *head)
3817 {
3818 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3819 	struct iommu_resv_region *reg;
3820 	struct dmar_rmrr_unit *rmrr;
3821 	struct device *i_dev;
3822 	int i;
3823 
3824 	rcu_read_lock();
3825 	for_each_rmrr_units(rmrr) {
3826 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3827 					  i, i_dev) {
3828 			struct iommu_resv_region *resv;
3829 			enum iommu_resv_type type;
3830 			size_t length;
3831 
3832 			if (i_dev != device &&
3833 			    !is_downstream_to_pci_bridge(device, i_dev))
3834 				continue;
3835 
3836 			length = rmrr->end_address - rmrr->base_address + 1;
3837 
3838 			type = device_rmrr_is_relaxable(device) ?
3839 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3840 
3841 			resv = iommu_alloc_resv_region(rmrr->base_address,
3842 						       length, prot, type,
3843 						       GFP_ATOMIC);
3844 			if (!resv)
3845 				break;
3846 
3847 			list_add_tail(&resv->list, head);
3848 		}
3849 	}
3850 	rcu_read_unlock();
3851 
3852 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3853 	if (dev_is_pci(device)) {
3854 		struct pci_dev *pdev = to_pci_dev(device);
3855 
3856 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3857 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3858 					IOMMU_RESV_DIRECT_RELAXABLE,
3859 					GFP_KERNEL);
3860 			if (reg)
3861 				list_add_tail(&reg->list, head);
3862 		}
3863 	}
3864 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3865 
3866 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3867 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3868 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3869 	if (!reg)
3870 		return;
3871 	list_add_tail(&reg->list, head);
3872 }
3873 
3874 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3875 {
3876 	if (dev_is_pci(dev))
3877 		return pci_device_group(dev);
3878 	return generic_device_group(dev);
3879 }
3880 
3881 int intel_iommu_enable_iopf(struct device *dev)
3882 {
3883 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3884 	struct intel_iommu *iommu = info->iommu;
3885 	int ret;
3886 
3887 	if (!info->pri_enabled)
3888 		return -ENODEV;
3889 
3890 	/* pri_enabled is protected by the group mutex. */
3891 	iommu_group_mutex_assert(dev);
3892 	if (info->iopf_refcount) {
3893 		info->iopf_refcount++;
3894 		return 0;
3895 	}
3896 
3897 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3898 	if (ret)
3899 		return ret;
3900 
3901 	info->iopf_refcount = 1;
3902 
3903 	return 0;
3904 }
3905 
3906 void intel_iommu_disable_iopf(struct device *dev)
3907 {
3908 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3909 	struct intel_iommu *iommu = info->iommu;
3910 
3911 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3912 		return;
3913 
3914 	iommu_group_mutex_assert(dev);
3915 	if (--info->iopf_refcount)
3916 		return;
3917 
3918 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3919 }
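
/*
 * Usage sketch (hypothetical caller, illustrative only): IOPF enablement is
 * reference counted and relies on the group mutex held by the IOMMU core,
 * so enable/disable calls must be balanced and PRI must already be enabled:
 *
 *	ret = intel_iommu_enable_iopf(dev);
 *	if (ret)
 *		return ret;
 *	... handle page requests delivered through the iopf queue ...
 *	intel_iommu_disable_iopf(dev);
 */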
3920 
3921 static bool intel_iommu_is_attach_deferred(struct device *dev)
3922 {
3923 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3924 
3925 	return translation_pre_enabled(info->iommu) && !info->domain;
3926 }
3927 
3928 /*
3929  * Check that the device does not live on an external-facing PCI port that is
3930  * marked as untrusted. Such devices should not be able to apply quirks and
3931  * thus bypass the IOMMU restrictions.
3932  */
3933 static bool risky_device(struct pci_dev *pdev)
3934 {
3935 	if (pdev->untrusted) {
3936 		pci_info(pdev,
3937 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3938 			 pdev->vendor, pdev->device);
3939 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3940 		return true;
3941 	}
3942 	return false;
3943 }
3944 
3945 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
3946 				      unsigned long iova, size_t size)
3947 {
3948 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
3949 
3950 	return 0;
3951 }
3952 
3953 void domain_remove_dev_pasid(struct iommu_domain *domain,
3954 			     struct device *dev, ioasid_t pasid)
3955 {
3956 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3957 	struct dev_pasid_info *curr, *dev_pasid = NULL;
3958 	struct intel_iommu *iommu = info->iommu;
3959 	struct dmar_domain *dmar_domain;
3960 	unsigned long flags;
3961 
3962 	if (!domain)
3963 		return;
3964 
3965 	/* The identity domain has no per-PASID metadata. */
3966 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
3967 		return;
3968 
3969 	dmar_domain = to_dmar_domain(domain);
3970 	spin_lock_irqsave(&dmar_domain->lock, flags);
3971 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
3972 		if (curr->dev == dev && curr->pasid == pasid) {
3973 			list_del(&curr->link_domain);
3974 			dev_pasid = curr;
3975 			break;
3976 		}
3977 	}
3978 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3979 
3980 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
3981 	domain_detach_iommu(dmar_domain, iommu);
3982 	if (!WARN_ON_ONCE(!dev_pasid)) {
3983 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
3984 		kfree(dev_pasid);
3985 	}
3986 }
3987 
3988 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3989 					 struct device *dev, ioasid_t pasid,
3990 					 struct iommu_domain *old)
3991 {
3992 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3993 
3994 	iopf_for_domain_remove(old, dev);
3995 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
3996 	domain_remove_dev_pasid(old, dev, pasid);
3997 
3998 	return 0;
3999 }
4000 
4001 struct dev_pasid_info *
4002 domain_add_dev_pasid(struct iommu_domain *domain,
4003 		     struct device *dev, ioasid_t pasid)
4004 {
4005 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4007 	struct intel_iommu *iommu = info->iommu;
4008 	struct dev_pasid_info *dev_pasid;
4009 	unsigned long flags;
4010 	int ret;
4011 
4012 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4013 	if (!dev_pasid)
4014 		return ERR_PTR(-ENOMEM);
4015 
4016 	ret = domain_attach_iommu(dmar_domain, iommu);
4017 	if (ret)
4018 		goto out_free;
4019 
4020 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4021 	if (ret)
4022 		goto out_detach_iommu;
4023 
4024 	dev_pasid->dev = dev;
4025 	dev_pasid->pasid = pasid;
4026 	spin_lock_irqsave(&dmar_domain->lock, flags);
4027 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4028 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4029 
4030 	return dev_pasid;
4031 out_detach_iommu:
4032 	domain_detach_iommu(dmar_domain, iommu);
4033 out_free:
4034 	kfree(dev_pasid);
4035 	return ERR_PTR(ret);
4036 }
4037 
4038 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4039 				     struct device *dev, ioasid_t pasid,
4040 				     struct iommu_domain *old)
4041 {
4042 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4043 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4044 	struct intel_iommu *iommu = info->iommu;
4045 	struct dev_pasid_info *dev_pasid;
4046 	int ret;
4047 
4048 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4049 		return -EINVAL;
4050 
4051 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4052 		return -EOPNOTSUPP;
4053 
4054 	if (domain->dirty_ops)
4055 		return -EINVAL;
4056 
4057 	if (context_copied(iommu, info->bus, info->devfn))
4058 		return -EBUSY;
4059 
4060 	ret = paging_domain_compatible(domain, dev);
4061 	if (ret)
4062 		return ret;
4063 
4064 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4065 	if (IS_ERR(dev_pasid))
4066 		return PTR_ERR(dev_pasid);
4067 
4068 	ret = iopf_for_domain_replace(domain, old, dev);
4069 	if (ret)
4070 		goto out_remove_dev_pasid;
4071 
4072 	if (dmar_domain->use_first_level)
4073 		ret = domain_setup_first_level(iommu, dmar_domain,
4074 					       dev, pasid, old);
4075 	else
4076 		ret = domain_setup_second_level(iommu, dmar_domain,
4077 						dev, pasid, old);
4078 	if (ret)
4079 		goto out_unwind_iopf;
4080 
4081 	domain_remove_dev_pasid(old, dev, pasid);
4082 
4083 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4084 
4085 	return 0;
4086 
4087 out_unwind_iopf:
4088 	iopf_for_domain_replace(old, domain, dev);
4089 out_remove_dev_pasid:
4090 	domain_remove_dev_pasid(domain, dev, pasid);
4091 	return ret;
4092 }
4093 
4094 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4095 {
4096 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4097 	struct intel_iommu *iommu = info->iommu;
4098 	struct iommu_hw_info_vtd *vtd;
4099 
4100 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4101 	if (!vtd)
4102 		return ERR_PTR(-ENOMEM);
4103 
4104 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4105 	vtd->cap_reg = iommu->cap;
4106 	vtd->ecap_reg = iommu->ecap;
4107 	*length = sizeof(*vtd);
4108 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4109 	return vtd;
4110 }
4111 
4112 /*
4113  * Set dirty tracking for the device list of a domain. The caller must
4114  * hold domain->lock when calling this function.
4115  */
4116 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4117 {
4118 	struct device_domain_info *info;
4119 	int ret = 0;
4120 
4121 	list_for_each_entry(info, devices, link) {
4122 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4123 						       IOMMU_NO_PASID, enable);
4124 		if (ret)
4125 			break;
4126 	}
4127 
4128 	return ret;
4129 }
4130 
4131 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4132 					    bool enable)
4133 {
4134 	struct dmar_domain *s1_domain;
4135 	unsigned long flags;
4136 	int ret;
4137 
4138 	spin_lock(&domain->s1_lock);
4139 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4140 		spin_lock_irqsave(&s1_domain->lock, flags);
4141 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4142 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4143 		if (ret)
4144 			goto err_unwind;
4145 	}
4146 	spin_unlock(&domain->s1_lock);
4147 	return 0;
4148 
4149 err_unwind:
4150 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4151 		spin_lock_irqsave(&s1_domain->lock, flags);
4152 		device_set_dirty_tracking(&s1_domain->devices,
4153 					  domain->dirty_tracking);
4154 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4155 	}
4156 	spin_unlock(&domain->s1_lock);
4157 	return ret;
4158 }
4159 
4160 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4161 					  bool enable)
4162 {
4163 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4164 	int ret;
4165 
4166 	spin_lock(&dmar_domain->lock);
4167 	if (dmar_domain->dirty_tracking == enable)
4168 		goto out_unlock;
4169 
4170 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4171 	if (ret)
4172 		goto err_unwind;
4173 
4174 	if (dmar_domain->nested_parent) {
4175 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4176 		if (ret)
4177 			goto err_unwind;
4178 	}
4179 
4180 	dmar_domain->dirty_tracking = enable;
4181 out_unlock:
4182 	spin_unlock(&dmar_domain->lock);
4183 
4184 	return 0;
4185 
4186 err_unwind:
4187 	device_set_dirty_tracking(&dmar_domain->devices,
4188 				  dmar_domain->dirty_tracking);
4189 	spin_unlock(&dmar_domain->lock);
4190 	return ret;
4191 }
4192 
4193 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4194 					    unsigned long iova, size_t size,
4195 					    unsigned long flags,
4196 					    struct iommu_dirty_bitmap *dirty)
4197 {
4198 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4199 	unsigned long end = iova + size - 1;
4200 	unsigned long pgsize;
4201 
4202 	/*
4203 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4204 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4205 	 * have occurred when we stopped dirty tracking. This ensures that we
4206 	 * never inherit dirtied bits from a previous cycle.
4207 	 */
4208 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4209 		return -EINVAL;
4210 
4211 	do {
4212 		struct dma_pte *pte;
4213 		int lvl = 0;
4214 
4215 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4216 				     GFP_ATOMIC);
4217 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4218 		if (!pte || !dma_pte_present(pte)) {
4219 			iova += pgsize;
4220 			continue;
4221 		}
4222 
4223 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4224 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4225 		iova += pgsize;
4226 	} while (iova < end);
4227 
4228 	return 0;
4229 }
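
/*
 * Walk granularity (illustrative, assuming the usual 9-bit stride per
 * level): the loop above advances by the size of the PTE it lands on,
 * e.g. 4KiB for a level-1 leaf and 2MiB for a level-2 superpage, and
 * records one range per dirty PTE via iommu_dirty_bitmap_record().
 */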
4230 
4231 static const struct iommu_dirty_ops intel_dirty_ops = {
4232 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4233 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4234 };
4235 
4236 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4237 {
4238 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4239 	struct intel_iommu *iommu = info->iommu;
4240 	struct context_entry *context;
4241 
4242 	spin_lock(&iommu->lock);
4243 	context = iommu_context_addr(iommu, bus, devfn, 1);
4244 	if (!context) {
4245 		spin_unlock(&iommu->lock);
4246 		return -ENOMEM;
4247 	}
4248 
4249 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4250 		spin_unlock(&iommu->lock);
4251 		return 0;
4252 	}
4253 
4254 	copied_context_tear_down(iommu, context, bus, devfn);
4255 	context_clear_entry(context);
4256 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4257 
4258 	/*
4259 	 * In pass-through mode, AW must be programmed to indicate the largest
4260 	 * AGAW value supported by the hardware; ASR is ignored by the hardware.
4261 	 */
4262 	context_set_address_width(context, iommu->msagaw);
4263 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4264 	context_set_fault_enable(context);
4265 	context_set_present(context);
4266 	if (!ecap_coherent(iommu->ecap))
4267 		clflush_cache_range(context, sizeof(*context));
4268 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4269 	spin_unlock(&iommu->lock);
4270 
4271 	return 0;
4272 }
4273 
4274 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4275 {
4276 	struct device *dev = data;
4277 
4278 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4279 }
4280 
4281 static int device_setup_pass_through(struct device *dev)
4282 {
4283 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4284 
4285 	if (!dev_is_pci(dev))
4286 		return context_setup_pass_through(dev, info->bus, info->devfn);
4287 
4288 	return pci_for_each_dma_alias(to_pci_dev(dev),
4289 				      context_setup_pass_through_cb, dev);
4290 }
4291 
4292 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4293 {
4294 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4295 	struct intel_iommu *iommu = info->iommu;
4296 	int ret;
4297 
4298 	device_block_translation(dev);
4299 
4300 	if (dev_is_real_dma_subdevice(dev))
4301 		return 0;
4302 
4303 	/*
4304 	 * No PRI support with the global identity domain. No need to enable or
4305 	 * disable PRI in this path as the iommu has been put in the blocking
4306 	 * state.
4307 	 */
4308 	if (sm_supported(iommu))
4309 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4310 	else
4311 		ret = device_setup_pass_through(dev);
4312 
4313 	if (!ret)
4314 		info->domain_attached = true;
4315 
4316 	return ret;
4317 }
4318 
4319 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4320 					 struct device *dev, ioasid_t pasid,
4321 					 struct iommu_domain *old)
4322 {
4323 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4324 	struct intel_iommu *iommu = info->iommu;
4325 	int ret;
4326 
4327 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4328 		return -EOPNOTSUPP;
4329 
4330 	ret = iopf_for_domain_replace(domain, old, dev);
4331 	if (ret)
4332 		return ret;
4333 
4334 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4335 	if (ret) {
4336 		iopf_for_domain_replace(old, domain, dev);
4337 		return ret;
4338 	}
4339 
4340 	domain_remove_dev_pasid(old, dev, pasid);
4341 	return 0;
4342 }
4343 
4344 static struct iommu_domain identity_domain = {
4345 	.type = IOMMU_DOMAIN_IDENTITY,
4346 	.ops = &(const struct iommu_domain_ops) {
4347 		.attach_dev	= identity_domain_attach_dev,
4348 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4349 	},
4350 };
4351 
4352 const struct iommu_ops intel_iommu_ops = {
4353 	.blocked_domain		= &blocking_domain,
4354 	.release_domain		= &blocking_domain,
4355 	.identity_domain	= &identity_domain,
4356 	.capable		= intel_iommu_capable,
4357 	.hw_info		= intel_iommu_hw_info,
4358 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4359 	.domain_alloc_sva	= intel_svm_domain_alloc,
4360 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4361 	.probe_device		= intel_iommu_probe_device,
4362 	.probe_finalize		= intel_iommu_probe_finalize,
4363 	.release_device		= intel_iommu_release_device,
4364 	.get_resv_regions	= intel_iommu_get_resv_regions,
4365 	.device_group		= intel_iommu_device_group,
4366 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4367 	.def_domain_type	= device_def_domain_type,
4368 	.pgsize_bitmap		= SZ_4K,
4369 	.page_response		= intel_iommu_page_response,
4370 	.default_domain_ops = &(const struct iommu_domain_ops) {
4371 		.attach_dev		= intel_iommu_attach_device,
4372 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4373 		.map_pages		= intel_iommu_map_pages,
4374 		.unmap_pages		= intel_iommu_unmap_pages,
4375 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4376 		.flush_iotlb_all        = intel_flush_iotlb_all,
4377 		.iotlb_sync		= intel_iommu_tlb_sync,
4378 		.iova_to_phys		= intel_iommu_iova_to_phys,
4379 		.free			= intel_iommu_domain_free,
4380 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4381 	}
4382 };
4383 
4384 static void quirk_iommu_igfx(struct pci_dev *dev)
4385 {
4386 	if (risky_device(dev))
4387 		return;
4388 
4389 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4390 	disable_igfx_iommu = 1;
4391 }
4392 
4393 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4398 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4401 
4402 /* QM57/QS57 integrated gfx malfunctions with dmar */
4403 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4404 
4405 /* Broadwell igfx malfunctions with dmar */
4406 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4421 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4422 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4424 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4425 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4426 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4430 
4431 static void quirk_iommu_rwbf(struct pci_dev *dev)
4432 {
4433 	if (risky_device(dev))
4434 		return;
4435 
4436 	/*
4437 	 * The Mobile 4 Series Chipset neglects to set the RWBF capability,
4438 	 * but needs it. The same seems to hold for the desktop versions.
4439 	 */
4440 	pci_info(dev, "Forcing write-buffer flush capability\n");
4441 	rwbf_quirk = 1;
4442 }
4443 
4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4451 
4452 #define GGC 0x52
4453 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4454 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4455 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4456 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4457 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4458 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4459 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4460 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
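
/*
 * Decode example (illustrative only): the quirk below tests bits 11:8 of
 * GGC. A field value of 0xb (GGC_MEMORY_SIZE_4M_VT) has GGC_MEMORY_VT_ENABLED
 * set, so the graphics IOMMU stays enabled; a field value of 0x1
 * (GGC_MEMORY_SIZE_1M) does not, so the quirk disables the graphics IOMMU.
 */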
4461 
4462 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4463 {
4464 	unsigned short ggc;
4465 
4466 	if (risky_device(dev))
4467 		return;
4468 
4469 	if (pci_read_config_word(dev, GGC, &ggc))
4470 		return;
4471 
4472 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4473 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4474 		disable_igfx_iommu = 1;
4475 	} else if (!disable_igfx_iommu) {
4476 		/* we have to ensure the gfx device is idle before we flush */
4477 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4478 		iommu_set_dma_strict();
4479 	}
4480 }
4481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4484 
4485 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4486 {
4487 	unsigned short ver;
4488 
4489 	if (!IS_GFX_DEVICE(dev))
4490 		return;
4491 
4492 	ver = (dev->device >> 8) & 0xff;
4493 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4494 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4495 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4496 		return;
4497 
4498 	if (risky_device(dev))
4499 		return;
4500 
4501 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4502 	iommu_skip_te_disable = 1;
4503 }
4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4505 
4506 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4507    ISOCH DMAR unit for the Azalia sound device, but not give it any
4508    TLB entries, which causes it to deadlock. Check for that.  We do
4509    this in a function called from init_dmars(), instead of in a PCI
4510    quirk, because we don't want to print the obnoxious "BIOS broken"
4511    message if VT-d is actually disabled.
4512 */
4513 static void __init check_tylersburg_isoch(void)
4514 {
4515 	struct pci_dev *pdev;
4516 	uint32_t vtisochctrl;
4517 
4518 	/* If there's no Azalia in the system anyway, forget it. */
4519 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4520 	if (!pdev)
4521 		return;
4522 
4523 	if (risky_device(pdev)) {
4524 		pci_dev_put(pdev);
4525 		return;
4526 	}
4527 
4528 	pci_dev_put(pdev);
4529 
4530 	/* System Management Registers. Might be hidden, in which case
4531 	   we can't do the sanity check. But that's OK, because the
4532 	   known-broken BIOSes _don't_ actually hide it, so far. */
4533 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4534 	if (!pdev)
4535 		return;
4536 
4537 	if (risky_device(pdev)) {
4538 		pci_dev_put(pdev);
4539 		return;
4540 	}
4541 
4542 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4543 		pci_dev_put(pdev);
4544 		return;
4545 	}
4546 
4547 	pci_dev_put(pdev);
4548 
4549 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4550 	if (vtisochctrl & 1)
4551 		return;
4552 
4553 	/* Drop all bits other than the number of TLB entries */
4554 	vtisochctrl &= 0x1c;
4555 
4556 	/* If we have the recommended number of TLB entries (16), fine. */
4557 	if (vtisochctrl == 0x10)
4558 		return;
4559 
4560 	/* Zero TLB entries? You get to ride the short bus to school. */
4561 	if (!vtisochctrl) {
4562 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4563 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4564 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4565 		     dmi_get_system_info(DMI_BIOS_VERSION),
4566 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4567 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4568 		return;
4569 	}
4570 
4571 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4572 	       vtisochctrl);
4573 }
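
/*
 * Worked example of the decode above (illustrative register values): a
 * vtisochctrl of 0x11 has bit 0 set, so Azalia DMA is routed to the
 * non-isoch DMAR unit and nothing more is checked; 0x10 masks to 16 TLB
 * entries, the recommended setting; 0x08 masks to 8 and triggers the
 * pr_warn(); 0x00 is the broken-BIOS case that forces identity mapping
 * for Azalia.
 */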
4574 
4575 /*
4576  * Here we deal with a device TLB defect where a device may inadvertently issue an
4577  * ATS invalidation completion before posted writes initiated with a translated
4578  * address that utilized translations matching the invalidation address range,
4579  * violating the invalidation completion ordering.
4580  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4581  * vulnerable to this defect. In other words, any dTLB invalidation not initiated
4582  * under the control of the trusted/privileged host device driver must use this
4583  * quirk.
4584  * Device TLBs are invalidated under the following six conditions:
4585  * 1. Device driver does a DMA API unmap of an IOVA
4586  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4587  * 3. PASID is torn down, after the PASID cache is flushed, e.g. on process
4588  *    exit_mmap() due to a crash
4589  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4590  *    the VM has to free pages that were unmapped
4591  * 5. Userspace driver unmaps a DMA buffer
4592  * 6. Cache invalidation in vSVA usage (upcoming)
4593  *
4594  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4595  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier call to
4596  * invalidate the TLB the same way as a normal user unmap, which uses this
4597  * quirk. The dTLB invalidation after a PASID cache flush does not need it.
4598  *
4599  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4600  */
4601 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4602 			       unsigned long address, unsigned long mask,
4603 			       u32 pasid, u16 qdep)
4604 {
4605 	u16 sid;
4606 
4607 	if (likely(!info->dtlb_extra_inval))
4608 		return;
4609 
4610 	sid = PCI_DEVID(info->bus, info->devfn);
4611 	if (pasid == IOMMU_NO_PASID) {
4612 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4613 				   qdep, address, mask);
4614 	} else {
4615 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4616 					 pasid, qdep, address, mask);
4617 	}
4618 }
4619 
4620 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4621 
4622 /*
4623  * Function to submit a command to the enhanced command interface. The
4624  * valid enhanced command descriptions are defined in Table 47 of the
4625  * VT-d spec. The VT-d hardware implementation may support some but not
4626  * all commands, which can be determined by checking the Enhanced
4627  * Command Capability Register.
4628  *
4629  * Return values:
4630  *  - 0: Command successful without any error;
4631  *  - Negative: software error value;
4632  *  - Nonzero positive: failure status code defined in Table 48.
4633  */
4634 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4635 {
4636 	unsigned long flags;
4637 	u64 res;
4638 	int ret;
4639 
4640 	if (!cap_ecmds(iommu->cap))
4641 		return -ENODEV;
4642 
4643 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4644 
4645 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4646 	if (res & DMA_ECMD_ECRSP_IP) {
4647 		ret = -EBUSY;
4648 		goto err;
4649 	}
4650 
4651 	/*
4652 	 * Unconditionally write operand B, because:
4653 	 * - There is no side effect if an ecmd doesn't require an
4654 	 *   operand B, yet we set the register to some value.
4655 	 * - It is not invoked in any critical path, so the extra MMIO
4656 	 *   write raises no performance concern.
4657 	 */
4658 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4659 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4660 
4661 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4662 		      !(res & DMA_ECMD_ECRSP_IP), res);
4663 
4664 	if (res & DMA_ECMD_ECRSP_IP) {
4665 		ret = -ETIMEDOUT;
4666 		goto err;
4667 	}
4668 
4669 	ret = ecmd_get_status_code(res);
4670 err:
4671 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4672 
4673 	return ret;
4674 }
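
/*
 * Usage sketch (hypothetical caller, illustrative only), following the
 * return-value convention documented above ecmd_submit_sync():
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)
 *		return ret;	(software error, e.g. -ENODEV or -ETIMEDOUT)
 *	else if (ret > 0)
 *		...		(failure status code defined in Table 48)
 *	else
 *		...		(command completed successfully)
 */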
4675