xref: /linux/drivers/iommu/intel/iommu.c (revision 336b4dae6dfecc9aa53a3a68c71b9c1c1d466388)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56 
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59 
60 /*
61  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
62  * (used when the kernel is launched with TXT).
63  */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67 
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69 
70 /*
71  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72  * if marked present.
73  */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 	if (!(re->lo & 1))
77 		return 0;
78 
79 	return re->lo & VTD_PAGE_MASK;
80 }
81 
82 /*
83  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84  * if marked present.
85  */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 	if (!(re->hi & 1))
89 		return 0;
90 
91 	return re->hi & VTD_PAGE_MASK;
92 }
93 
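/*
 * rb-tree comparison helpers for the per-IOMMU device tree. Devices are
 * keyed by their PCI source ID (bus:devfn) as stored in device_domain_info.
 */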
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 	struct device_domain_info *info =
97 		rb_entry(node, struct device_domain_info, node);
98 	const u16 *rid_lhs = key;
99 
100 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 		return -1;
102 
103 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 		return 1;
105 
106 	return 0;
107 }
108 
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 	struct device_domain_info *info =
112 		rb_entry(lhs, struct device_domain_info, node);
113 	u16 key = PCI_DEVID(info->bus, info->devfn);
114 
115 	return device_rid_cmp_key(&key, rhs);
116 }
117 
118 /*
119  * Looks up an IOMMU-probed device using its source ID.
120  *
121  * Returns the pointer to the device if there is a match. Otherwise,
122  * returns NULL.
123  *
124  * Note that this helper doesn't guarantee that the device won't be
125  * released by the iommu subsystem after being returned. The caller
126  * should use its own synchronization mechanism to avoid the device
127  * being released during its use if that is possibly the case.
128  */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 	struct device_domain_info *info = NULL;
132 	struct rb_node *node;
133 	unsigned long flags;
134 
135 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 	if (node)
138 		info = rb_entry(node, struct device_domain_info, node);
139 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140 
141 	return info ? info->dev : NULL;
142 }
143 
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 				struct device_domain_info *info)
146 {
147 	struct rb_node *curr;
148 	unsigned long flags;
149 
150 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 	if (WARN_ON(curr))
154 		return -EEXIST;
155 
156 	return 0;
157 }
158 
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 	struct intel_iommu *iommu = info->iommu;
162 	unsigned long flags;
163 
164 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 	rb_erase(&info->node, &iommu->device_rbtree);
166 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168 
169 struct dmar_rmrr_unit {
170 	struct list_head list;		/* list of rmrr units	*/
171 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
172 	u64	base_address;		/* reserved base address*/
173 	u64	end_address;		/* reserved end address */
174 	struct dmar_dev_scope *devices;	/* target devices */
175 	int	devices_cnt;		/* target device count */
176 };
177 
178 struct dmar_atsr_unit {
179 	struct list_head list;		/* list of ATSR units */
180 	struct acpi_dmar_header *hdr;	/* ACPI header */
181 	struct dmar_dev_scope *devices;	/* target devices */
182 	int devices_cnt;		/* target device count */
183 	u8 include_all:1;		/* include all ports */
184 };
185 
186 struct dmar_satc_unit {
187 	struct list_head list;		/* list of SATC units */
188 	struct acpi_dmar_header *hdr;	/* ACPI header */
189 	struct dmar_dev_scope *devices;	/* target devices */
190 	struct intel_iommu *iommu;	/* the corresponding iommu */
191 	int devices_cnt;		/* target device count */
192 	u8 atc_required:1;		/* ATS is required */
193 };
194 
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198 
199 #define for_each_rmrr_units(rmrr) \
200 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201 
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203 
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206 
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209 
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214 
215 #define IDENTMAP_AZALIA		4
216 
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219 
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224 
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229 
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 	u32 gsts;
233 
234 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 	if (gsts & DMA_GSTS_TES)
236 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238 
239 static int __init intel_iommu_setup(char *str)
240 {
241 	if (!str)
242 		return -EINVAL;
243 
244 	while (*str) {
245 		if (!strncmp(str, "on", 2)) {
246 			dmar_disabled = 0;
247 			pr_info("IOMMU enabled\n");
248 		} else if (!strncmp(str, "off", 3)) {
249 			dmar_disabled = 1;
250 			no_platform_optin = 1;
251 			pr_info("IOMMU disabled\n");
252 		} else if (!strncmp(str, "igfx_off", 8)) {
253 			disable_igfx_iommu = 1;
254 			pr_info("Disable GFX device mapping\n");
255 		} else if (!strncmp(str, "forcedac", 8)) {
256 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 			iommu_dma_forcedac = true;
258 		} else if (!strncmp(str, "strict", 6)) {
259 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 			iommu_set_dma_strict();
261 		} else if (!strncmp(str, "sp_off", 6)) {
262 			pr_info("Disable supported super page\n");
263 			intel_iommu_superpage = 0;
264 		} else if (!strncmp(str, "sm_on", 5)) {
265 			pr_info("Enable scalable mode if hardware supports\n");
266 			intel_iommu_sm = 1;
267 		} else if (!strncmp(str, "sm_off", 6)) {
268 			pr_info("Scalable mode is disallowed\n");
269 			intel_iommu_sm = 0;
270 		} else if (!strncmp(str, "tboot_noforce", 13)) {
271 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 			intel_iommu_tboot_noforce = 1;
273 		} else {
274 			pr_notice("Unknown option - '%s'\n", str);
275 		}
276 
277 		str += strcspn(str, ",");
278 		while (*str == ',')
279 			str++;
280 	}
281 
282 	return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285 
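/* Check that a PFN is addressable within the domain's address width. */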
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289 
290 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292 
293 /*
294  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296  * the returned SAGAW.
297  */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 	unsigned long fl_sagaw, sl_sagaw;
301 
302 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 	sl_sagaw = cap_sagaw(iommu->cap);
304 
305 	/* Second level only. */
306 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 		return sl_sagaw;
308 
309 	/* First level only. */
310 	if (!ecap_slts(iommu->ecap))
311 		return fl_sagaw;
312 
313 	return fl_sagaw & sl_sagaw;
314 }
315 
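/* Pick the largest supported AGAW that does not exceed @max_gaw, or -1 if none. */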
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 	unsigned long sagaw;
319 	int agaw;
320 
321 	sagaw = __iommu_calculate_sagaw(iommu);
322 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 		if (test_bit(agaw, &sagaw))
324 			break;
325 	}
326 
327 	return agaw;
328 }
329 
330 /*
331  * Calculate max SAGAW for each iommu.
332  */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337 
338 /*
339  * Calculate the agaw for each iommu.
340  * "SAGAW" may be different across iommus, so use a default agaw and
341  * fall back to a smaller supported agaw for iommus that don't support the default.
342  */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347 
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 	return sm_supported(iommu) ?
351 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353 
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 	unsigned long bitmap = 0;
358 
359 	/*
360 	 * 1-level super page supports page size of 2MiB, 2-level super page
361 	 * supports page size of both 2MiB and 1GiB.
362 	 */
363 	if (domain->iommu_superpage == 1)
364 		bitmap |= SZ_2M;
365 	else if (domain->iommu_superpage == 2)
366 		bitmap |= SZ_2M | SZ_1G;
367 
368 	return bitmap;
369 }
370 
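/*
 * Return the context entry for @bus/@devfn, allocating the context table
 * on demand when @alloc is set. In scalable mode the lower and upper
 * halves of the devfn space are reached through root->lo and root->hi
 * respectively.
 */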
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 					 u8 devfn, int alloc)
373 {
374 	struct root_entry *root = &iommu->root_entry[bus];
375 	struct context_entry *context;
376 	u64 *entry;
377 
378 	/*
379 	 * Unless the caller requested to allocate a new entry,
380 	 * returning a copied context entry makes no sense.
381 	 */
382 	if (!alloc && context_copied(iommu, bus, devfn))
383 		return NULL;
384 
385 	entry = &root->lo;
386 	if (sm_supported(iommu)) {
387 		if (devfn >= 0x80) {
388 			devfn -= 0x80;
389 			entry = &root->hi;
390 		}
391 		devfn *= 2;
392 	}
393 	if (*entry & 1)
394 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 	else {
396 		unsigned long phy_addr;
397 		if (!alloc)
398 			return NULL;
399 
400 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 		if (!context)
402 			return NULL;
403 
404 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 		phy_addr = virt_to_phys((void *)context);
406 		*entry = phy_addr | 1;
407 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
408 	}
409 	return &context[devfn];
410 }
411 
412 /**
413  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414  *				 sub-hierarchy of a candidate PCI-PCI bridge
415  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416  * @bridge: the candidate PCI-PCI bridge
417  *
418  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419  */
420 static bool
421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 	struct pci_dev *pdev, *pbridge;
424 
425 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 		return false;
427 
428 	pdev = to_pci_dev(dev);
429 	pbridge = to_pci_dev(bridge);
430 
431 	if (pbridge->subordinate &&
432 	    pbridge->subordinate->number <= pdev->bus->number &&
433 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 		return true;
435 
436 	return false;
437 }
438 
439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 	struct dmar_drhd_unit *drhd;
442 	u32 vtbar;
443 	int rc;
444 
445 	/* We know that this device on this chipset has its own IOMMU.
446 	 * If we find it under a different IOMMU, then the BIOS is lying
447 	 * to us. Hope that the IOMMU for this device is actually
448 	 * disabled, and it needs no translation...
449 	 */
450 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 	if (rc) {
452 		/* "can't" happen */
453 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 		return false;
455 	}
456 	vtbar &= 0xffff0000;
457 
458 	/* we know that this iommu should be at offset 0xa000 from vtbar */
459 	drhd = dmar_find_matched_drhd_unit(pdev);
460 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 		return true;
464 	}
465 
466 	return false;
467 }
468 
469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 	if (!iommu || iommu->drhd->ignored)
472 		return true;
473 
474 	if (dev_is_pci(dev)) {
475 		struct pci_dev *pdev = to_pci_dev(dev);
476 
477 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 		    quirk_ioat_snb_local_iommu(pdev))
480 			return true;
481 	}
482 
483 	return false;
484 }
485 
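/*
 * Walk the DRHD units to find the IOMMU that covers @dev and optionally
 * return the bus/devfn to be used when programming its context entry.
 */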
486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 	struct dmar_drhd_unit *drhd = NULL;
489 	struct pci_dev *pdev = NULL;
490 	struct intel_iommu *iommu;
491 	struct device *tmp;
492 	u16 segment = 0;
493 	int i;
494 
495 	if (!dev)
496 		return NULL;
497 
498 	if (dev_is_pci(dev)) {
499 		struct pci_dev *pf_pdev;
500 
501 		pdev = pci_real_dma_dev(to_pci_dev(dev));
502 
503 		/* VFs aren't listed in scope tables; we need to look up
504 		 * the PF instead to find the IOMMU. */
505 		pf_pdev = pci_physfn(pdev);
506 		dev = &pf_pdev->dev;
507 		segment = pci_domain_nr(pdev->bus);
508 	} else if (has_acpi_companion(dev))
509 		dev = &ACPI_COMPANION(dev)->dev;
510 
511 	rcu_read_lock();
512 	for_each_iommu(iommu, drhd) {
513 		if (pdev && segment != drhd->segment)
514 			continue;
515 
516 		for_each_active_dev_scope(drhd->devices,
517 					  drhd->devices_cnt, i, tmp) {
518 			if (tmp == dev) {
519 				/* For a VF use its original BDF# not that of the PF
520 				 * which we used for the IOMMU lookup. Strictly speaking
521 				 * we could do this for all PCI devices; we only need to
522 				 * get the BDF# from the scope table for ACPI matches. */
523 				if (pdev && pdev->is_virtfn)
524 					goto got_pdev;
525 
526 				if (bus && devfn) {
527 					*bus = drhd->devices[i].bus;
528 					*devfn = drhd->devices[i].devfn;
529 				}
530 				goto out;
531 			}
532 
533 			if (is_downstream_to_pci_bridge(dev, tmp))
534 				goto got_pdev;
535 		}
536 
537 		if (pdev && drhd->include_all) {
538 got_pdev:
539 			if (bus && devfn) {
540 				*bus = pdev->bus->number;
541 				*devfn = pdev->devfn;
542 			}
543 			goto out;
544 		}
545 	}
546 	iommu = NULL;
547 out:
548 	if (iommu_is_dummy(iommu, dev))
549 		iommu = NULL;
550 
551 	rcu_read_unlock();
552 
553 	return iommu;
554 }
555 
556 static void domain_flush_cache(struct dmar_domain *domain,
557 			       void *addr, int size)
558 {
559 	if (!domain->iommu_coherency)
560 		clflush_cache_range(addr, size);
561 }
562 
563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 	struct context_entry *context;
566 	int i;
567 
568 	if (!iommu->root_entry)
569 		return;
570 
571 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 		context = iommu_context_addr(iommu, i, 0, 0);
573 		if (context)
574 			iommu_free_page(context);
575 
576 		if (!sm_supported(iommu))
577 			continue;
578 
579 		context = iommu_context_addr(iommu, i, 0x80, 0);
580 		if (context)
581 			iommu_free_page(context);
582 	}
583 
584 	iommu_free_page(iommu->root_entry);
585 	iommu->root_entry = NULL;
586 }
587 
588 #ifdef CONFIG_DMAR_DEBUG
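/* Dump the page-table entries translating @pfn, for fault diagnosis. */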
589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 	struct dma_pte *pte;
593 	int offset;
594 
595 	while (1) {
596 		offset = pfn_level_offset(pfn, level);
597 		pte = &parent[offset];
598 
599 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600 
601 		if (!dma_pte_present(pte)) {
602 			pr_info("page table not present at level %d\n", level - 1);
603 			break;
604 		}
605 
606 		if (level == 1 || dma_pte_superpage(pte))
607 			break;
608 
609 		parent = phys_to_virt(dma_pte_addr(pte));
610 		level--;
611 	}
612 }
613 
614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 			  unsigned long long addr, u32 pasid)
616 {
617 	struct pasid_dir_entry *dir, *pde;
618 	struct pasid_entry *entries, *pte;
619 	struct context_entry *ctx_entry;
620 	struct root_entry *rt_entry;
621 	int i, dir_index, index, level;
622 	u8 devfn = source_id & 0xff;
623 	u8 bus = source_id >> 8;
624 	struct dma_pte *pgtable;
625 
626 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627 
628 	/* root entry dump */
629 	if (!iommu->root_entry) {
630 		pr_info("root table is not present\n");
631 		return;
632 	}
633 	rt_entry = &iommu->root_entry[bus];
634 
635 	if (sm_supported(iommu))
636 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 			rt_entry->hi, rt_entry->lo);
638 	else
639 		pr_info("root entry: 0x%016llx", rt_entry->lo);
640 
641 	/* context entry dump */
642 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 	if (!ctx_entry) {
644 		pr_info("context table is not present\n");
645 		return;
646 	}
647 
648 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 		ctx_entry->hi, ctx_entry->lo);
650 
651 	/* legacy mode does not require PASID entries */
652 	if (!sm_supported(iommu)) {
653 		if (!context_present(ctx_entry)) {
654 			pr_info("legacy mode page table is not present\n");
655 			return;
656 		}
657 		level = agaw_to_level(ctx_entry->hi & 7);
658 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 		goto pgtable_walk;
660 	}
661 
662 	if (!context_present(ctx_entry)) {
663 		pr_info("pasid directory table is not present\n");
664 		return;
665 	}
666 
667 	/* get the pointer to pasid directory entry */
668 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669 
670 	/* For request-without-pasid, get the pasid from context entry */
671 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 		pasid = IOMMU_NO_PASID;
673 
674 	dir_index = pasid >> PASID_PDE_SHIFT;
675 	pde = &dir[dir_index];
676 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677 
678 	/* get the pointer to the pasid table entry */
679 	entries = get_pasid_table_from_pde(pde);
680 	if (!entries) {
681 		pr_info("pasid table is not present\n");
682 		return;
683 	}
684 	index = pasid & PASID_PTE_MASK;
685 	pte = &entries[index];
686 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688 
689 	if (!pasid_pte_is_present(pte)) {
690 		pr_info("scalable mode page table is not present\n");
691 		return;
692 	}
693 
694 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 	} else {
698 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 	}
701 
702 pgtable_walk:
703 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706 
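/*
 * Walk, and build if necessary, the page table for @pfn down to
 * *target_level. A *target_level of zero means "stop at the first
 * superpage or non-present entry", in which case the level reached is
 * returned through *target_level.
 */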
707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 				      unsigned long pfn, int *target_level,
709 				      gfp_t gfp)
710 {
711 	struct dma_pte *parent, *pte;
712 	int level = agaw_to_level(domain->agaw);
713 	int offset;
714 
715 	if (!domain_pfn_supported(domain, pfn))
716 		/* Address beyond IOMMU's addressing capabilities. */
717 		return NULL;
718 
719 	parent = domain->pgd;
720 
721 	while (1) {
722 		void *tmp_page;
723 
724 		offset = pfn_level_offset(pfn, level);
725 		pte = &parent[offset];
726 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 			break;
728 		if (level == *target_level)
729 			break;
730 
731 		if (!dma_pte_present(pte)) {
732 			uint64_t pteval, tmp;
733 
734 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735 
736 			if (!tmp_page)
737 				return NULL;
738 
739 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
741 				 DMA_PTE_WRITE;
742 			if (domain->use_first_level)
743 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
744 
745 			tmp = 0ULL;
746 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
747 				/* Someone else set it while we were thinking; use theirs. */
748 				iommu_free_page(tmp_page);
749 			else
750 				domain_flush_cache(domain, pte, sizeof(*pte));
751 		}
752 		if (level == 1)
753 			break;
754 
755 		parent = phys_to_virt(dma_pte_addr(pte));
756 		level--;
757 	}
758 
759 	if (!*target_level)
760 		*target_level = level;
761 
762 	return pte;
763 }
764 
765 /* return address's pte at specific level */
766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 					 unsigned long pfn,
768 					 int level, int *large_page)
769 {
770 	struct dma_pte *parent, *pte;
771 	int total = agaw_to_level(domain->agaw);
772 	int offset;
773 
774 	parent = domain->pgd;
775 	while (level <= total) {
776 		offset = pfn_level_offset(pfn, total);
777 		pte = &parent[offset];
778 		if (level == total)
779 			return pte;
780 
781 		if (!dma_pte_present(pte)) {
782 			*large_page = total;
783 			break;
784 		}
785 
786 		if (dma_pte_superpage(pte)) {
787 			*large_page = total;
788 			return pte;
789 		}
790 
791 		parent = phys_to_virt(dma_pte_addr(pte));
792 		total--;
793 	}
794 	return NULL;
795 }
796 
797 /* Clear last level ptes; a TLB flush should follow. */
798 static void dma_pte_clear_range(struct dmar_domain *domain,
799 				unsigned long start_pfn,
800 				unsigned long last_pfn)
801 {
802 	unsigned int large_page;
803 	struct dma_pte *first_pte, *pte;
804 
805 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 	    WARN_ON(start_pfn > last_pfn))
807 		return;
808 
809 	/* we don't need lock here; nobody else touches the iova range */
810 	do {
811 		large_page = 1;
812 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 		if (!pte) {
814 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 			continue;
816 		}
817 		do {
818 			dma_clear_pte(pte);
819 			start_pfn += lvl_to_nr_pages(large_page);
820 			pte++;
821 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822 
823 		domain_flush_cache(domain, first_pte,
824 				   (void *)pte - (void *)first_pte);
825 
826 	} while (start_pfn && start_pfn <= last_pfn);
827 }
828 
829 static void dma_pte_free_level(struct dmar_domain *domain, int level,
830 			       int retain_level, struct dma_pte *pte,
831 			       unsigned long pfn, unsigned long start_pfn,
832 			       unsigned long last_pfn)
833 {
834 	pfn = max(start_pfn, pfn);
835 	pte = &pte[pfn_level_offset(pfn, level)];
836 
837 	do {
838 		unsigned long level_pfn;
839 		struct dma_pte *level_pte;
840 
841 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
842 			goto next;
843 
844 		level_pfn = pfn & level_mask(level);
845 		level_pte = phys_to_virt(dma_pte_addr(pte));
846 
847 		if (level > 2) {
848 			dma_pte_free_level(domain, level - 1, retain_level,
849 					   level_pte, level_pfn, start_pfn,
850 					   last_pfn);
851 		}
852 
853 		/*
854 		 * Free the page table if we're below the level we want to
855 		 * retain and the range covers the entire table.
856 		 */
857 		if (level < retain_level && !(start_pfn > level_pfn ||
858 		      last_pfn < level_pfn + level_size(level) - 1)) {
859 			dma_clear_pte(pte);
860 			domain_flush_cache(domain, pte, sizeof(*pte));
861 			iommu_free_page(level_pte);
862 		}
863 next:
864 		pfn += level_size(level);
865 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
866 }
867 
868 /*
869  * clear last level (leaf) ptes and free page table pages below the
870  * level we wish to keep intact.
871  */
872 static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 				   unsigned long start_pfn,
874 				   unsigned long last_pfn,
875 				   int retain_level)
876 {
877 	dma_pte_clear_range(domain, start_pfn, last_pfn);
878 
879 	/* We don't need lock here; nobody else touches the iova range */
880 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 			   domain->pgd, 0, start_pfn, last_pfn);
882 
883 	/* free pgd */
884 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 		iommu_free_page(domain->pgd);
886 		domain->pgd = NULL;
887 	}
888 }
889 
890 /* When a page at a given level is being unlinked from its parent, we don't
891    need to *modify* it at all. All we need to do is make a list of all the
892    pages which can be freed just as soon as we've flushed the IOTLB and we
893    know the hardware page-walk will no longer touch them.
894    The 'pte' argument is the *parent* PTE, pointing to the page that is to
895    be freed. */
896 static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 				    int level, struct dma_pte *pte,
898 				    struct list_head *freelist)
899 {
900 	struct page *pg;
901 
902 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 	list_add_tail(&pg->lru, freelist);
904 
905 	if (level == 1)
906 		return;
907 
908 	pte = page_address(pg);
909 	do {
910 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 		pte++;
913 	} while (!first_pte_in_page(pte));
914 }
915 
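/*
 * Clear the PTEs covering [start_pfn, last_pfn] at this level and queue
 * fully-covered page-table pages on @freelist to be freed after the
 * IOTLB flush.
 */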
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 				struct dma_pte *pte, unsigned long pfn,
918 				unsigned long start_pfn, unsigned long last_pfn,
919 				struct list_head *freelist)
920 {
921 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
922 
923 	pfn = max(start_pfn, pfn);
924 	pte = &pte[pfn_level_offset(pfn, level)];
925 
926 	do {
927 		unsigned long level_pfn = pfn & level_mask(level);
928 
929 		if (!dma_pte_present(pte))
930 			goto next;
931 
932 		/* If range covers entire pagetable, free it */
933 		if (start_pfn <= level_pfn &&
934 		    last_pfn >= level_pfn + level_size(level) - 1) {
935 			/* These subordinate page tables are going away entirely. Don't
936 			   bother to clear them; we're just going to *free* them. */
937 			if (level > 1 && !dma_pte_superpage(pte))
938 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939 
940 			dma_clear_pte(pte);
941 			if (!first_pte)
942 				first_pte = pte;
943 			last_pte = pte;
944 		} else if (level > 1) {
945 			/* Recurse down into a level that isn't *entirely* obsolete */
946 			dma_pte_clear_level(domain, level - 1,
947 					    phys_to_virt(dma_pte_addr(pte)),
948 					    level_pfn, start_pfn, last_pfn,
949 					    freelist);
950 		}
951 next:
952 		pfn = level_pfn + level_size(level);
953 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954 
955 	if (first_pte)
956 		domain_flush_cache(domain, first_pte,
957 				   (void *)++last_pte - (void *)first_pte);
958 }
959 
960 /* We can't just free the pages because the IOMMU may still be walking
961    the page tables, and may have cached the intermediate levels. The
962    pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 			 unsigned long last_pfn, struct list_head *freelist)
965 {
966 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 	    WARN_ON(start_pfn > last_pfn))
968 		return;
969 
970 	/* we don't need lock here; nobody else touches the iova range */
971 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
973 
974 	/* free pgd */
975 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 		struct page *pgd_page = virt_to_page(domain->pgd);
977 		list_add_tail(&pgd_page->lru, freelist);
978 		domain->pgd = NULL;
979 	}
980 }
981 
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 	struct root_entry *root;
986 
987 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 	if (!root) {
989 		pr_err("Allocating root entry for %s failed\n",
990 			iommu->name);
991 		return -ENOMEM;
992 	}
993 
994 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
995 	iommu->root_entry = root;
996 
997 	return 0;
998 }
999 
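/* Program the root table address and issue the Set Root Table Pointer command. */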
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 	u64 addr;
1003 	u32 sts;
1004 	unsigned long flag;
1005 
1006 	addr = virt_to_phys(iommu->root_entry);
1007 	if (sm_supported(iommu))
1008 		addr |= DMA_RTADDR_SMT;
1009 
1010 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012 
1013 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014 
1015 	/* Make sure hardware completes it */
1016 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 		      readl, (sts & DMA_GSTS_RTPS), sts);
1018 
1019 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020 
1021 	/*
1022 	 * Hardware invalidates all DMA remapping hardware translation
1023 	 * caches as part of SRTP flow.
1024 	 */
1025 	if (cap_esrtps(iommu->cap))
1026 		return;
1027 
1028 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 	if (sm_supported(iommu))
1030 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033 
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 	u32 val;
1037 	unsigned long flag;
1038 
1039 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 		return;
1041 
1042 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044 
1045 	/* Make sure hardware completes it */
1046 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048 
1049 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051 
1052 /* The return value determines if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 				  u16 did, u16 source_id, u8 function_mask,
1055 				  u64 type)
1056 {
1057 	u64 val = 0;
1058 	unsigned long flag;
1059 
1060 	switch (type) {
1061 	case DMA_CCMD_GLOBAL_INVL:
1062 		val = DMA_CCMD_GLOBAL_INVL;
1063 		break;
1064 	case DMA_CCMD_DOMAIN_INVL:
1065 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 		break;
1067 	case DMA_CCMD_DEVICE_INVL:
1068 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 		break;
1071 	default:
1072 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 			iommu->name, type);
1074 		return;
1075 	}
1076 	val |= DMA_CCMD_ICC;
1077 
1078 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080 
1081 	/* Make sure hardware completes it */
1082 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084 
1085 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087 
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 			 unsigned int size_order, u64 type)
1090 {
1091 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 	u64 val = 0, val_iva = 0;
1093 	unsigned long flag;
1094 
1095 	switch (type) {
1096 	case DMA_TLB_GLOBAL_FLUSH:
1097 		/* global flush doesn't need to set IVA_REG */
1098 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 		break;
1100 	case DMA_TLB_DSI_FLUSH:
1101 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 		break;
1103 	case DMA_TLB_PSI_FLUSH:
1104 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 		/* IH bit is passed in as part of address */
1106 		val_iva = size_order | addr;
1107 		break;
1108 	default:
1109 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 			iommu->name, type);
1111 		return;
1112 	}
1113 
1114 	if (cap_write_drain(iommu->cap))
1115 		val |= DMA_TLB_WRITE_DRAIN;
1116 
1117 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 	/* Note: Only uses first TLB reg currently */
1119 	if (val_iva)
1120 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122 
1123 	/* Make sure hardware completes it */
1124 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126 
1127 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128 
1129 	/* check IOTLB invalidation granularity */
1130 	if (DMA_TLB_IAIG(val) == 0)
1131 		pr_err("Flush IOTLB failed\n");
1132 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 			(unsigned long long)DMA_TLB_IIRG(type),
1135 			(unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137 
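/* Find the device_domain_info for @iommu/@bus/@devfn among @domain's devices. */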
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 	struct device_domain_info *info;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&domain->lock, flags);
1146 	list_for_each_entry(info, &domain->devices, link) {
1147 		if (info->iommu == iommu && info->bus == bus &&
1148 		    info->devfn == devfn) {
1149 			spin_unlock_irqrestore(&domain->lock, flags);
1150 			return info;
1151 		}
1152 	}
1153 	spin_unlock_irqrestore(&domain->lock, flags);
1154 
1155 	return NULL;
1156 }
1157 
1158 /*
1159  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161  * check because it applies only to the built-in QAT devices and it doesn't
1162  * grant additional privileges.
1163  */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 		return false;
1169 
1170 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 		return false;
1172 
1173 	return true;
1174 }
1175 
1176 static void iommu_enable_pci_ats(struct device_domain_info *info)
1177 {
1178 	struct pci_dev *pdev;
1179 
1180 	if (!info->ats_supported)
1181 		return;
1182 
1183 	pdev = to_pci_dev(info->dev);
1184 	if (!pci_ats_page_aligned(pdev))
1185 		return;
1186 
1187 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1188 		info->ats_enabled = 1;
1189 }
1190 
1191 static void iommu_disable_pci_ats(struct device_domain_info *info)
1192 {
1193 	if (!info->ats_enabled)
1194 		return;
1195 
1196 	pci_disable_ats(to_pci_dev(info->dev));
1197 	info->ats_enabled = 0;
1198 }
1199 
1200 static void iommu_enable_pci_pri(struct device_domain_info *info)
1201 {
1202 	struct pci_dev *pdev;
1203 
1204 	if (!info->ats_enabled || !info->pri_supported)
1205 		return;
1206 
1207 	pdev = to_pci_dev(info->dev);
1208 	/* PASID is required in PRG Response Message. */
1209 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1210 		return;
1211 
1212 	if (pci_reset_pri(pdev))
1213 		return;
1214 
1215 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
1216 		info->pri_enabled = 1;
1217 }
1218 
1219 static void iommu_disable_pci_pri(struct device_domain_info *info)
1220 {
1221 	if (!info->pri_enabled)
1222 		return;
1223 
1224 	if (WARN_ON(info->iopf_refcount))
1225 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1226 
1227 	pci_disable_pri(to_pci_dev(info->dev));
1228 	info->pri_enabled = 0;
1229 }
1230 
1231 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1232 {
1233 	cache_tag_flush_all(to_dmar_domain(domain));
1234 }
1235 
1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1237 {
1238 	u32 pmen;
1239 	unsigned long flags;
1240 
1241 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1242 		return;
1243 
1244 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1245 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1246 	pmen &= ~DMA_PMEN_EPM;
1247 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1248 
1249 	/* wait for the protected region status bit to clear */
1250 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1251 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1252 
1253 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1254 }
1255 
1256 static void iommu_enable_translation(struct intel_iommu *iommu)
1257 {
1258 	u32 sts;
1259 	unsigned long flags;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1262 	iommu->gcmd |= DMA_GCMD_TE;
1263 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1264 
1265 	/* Make sure hardware completes it */
1266 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1267 		      readl, (sts & DMA_GSTS_TES), sts);
1268 
1269 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1270 }
1271 
1272 static void iommu_disable_translation(struct intel_iommu *iommu)
1273 {
1274 	u32 sts;
1275 	unsigned long flag;
1276 
1277 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1278 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1279 		return;
1280 
1281 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 	iommu->gcmd &= ~DMA_GCMD_TE;
1283 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1284 
1285 	/* Make sure hardware completes it */
1286 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1287 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1288 
1289 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 }
1291 
1292 static int iommu_init_domains(struct intel_iommu *iommu)
1293 {
1294 	u32 ndomains;
1295 
1296 	ndomains = cap_ndoms(iommu->cap);
1297 	pr_debug("%s: Number of Domains supported <%d>\n",
1298 		 iommu->name, ndomains);
1299 
1300 	spin_lock_init(&iommu->lock);
1301 
1302 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1303 	if (!iommu->domain_ids)
1304 		return -ENOMEM;
1305 
1306 	/*
1307 	 * If Caching mode is set, then invalid translations are tagged
1308 	 * with domain-id 0, hence we need to pre-allocate it. We also
1309 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1310 	 * make sure it is not used for a real domain.
1311 	 */
1312 	set_bit(0, iommu->domain_ids);
1313 
1314 	/*
1315 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1316 	 * entry for first-level or pass-through translation modes should
1317 	 * be programmed with a domain id different from those used for
1318 	 * second-level or nested translation. We reserve a domain id for
1319 	 * this purpose. This domain id is also used for identity domain
1320 	 * in legacy mode.
1321 	 */
1322 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1323 
1324 	return 0;
1325 }
1326 
1327 static void disable_dmar_iommu(struct intel_iommu *iommu)
1328 {
1329 	if (!iommu->domain_ids)
1330 		return;
1331 
1332 	/*
1333 	 * All iommu domains must have been detached from the devices,
1334 	 * hence there should be no domain IDs in use.
1335 	 */
1336 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1337 		    > NUM_RESERVED_DID))
1338 		return;
1339 
1340 	if (iommu->gcmd & DMA_GCMD_TE)
1341 		iommu_disable_translation(iommu);
1342 }
1343 
1344 static void free_dmar_iommu(struct intel_iommu *iommu)
1345 {
1346 	if (iommu->domain_ids) {
1347 		bitmap_free(iommu->domain_ids);
1348 		iommu->domain_ids = NULL;
1349 	}
1350 
1351 	if (iommu->copied_tables) {
1352 		bitmap_free(iommu->copied_tables);
1353 		iommu->copied_tables = NULL;
1354 	}
1355 
1356 	/* free context mapping */
1357 	free_context_table(iommu);
1358 
1359 	if (ecap_prs(iommu->ecap))
1360 		intel_iommu_finish_prq(iommu);
1361 }
1362 
1363 /*
1364  * Check and return whether first level is used by default for
1365  * DMA translation.
1366  */
1367 static bool first_level_by_default(struct intel_iommu *iommu)
1368 {
1369 	/* Only SL is available in legacy mode */
1370 	if (!sm_supported(iommu))
1371 		return false;
1372 
1373 	/* Only one level (either FL or SL) is available, just use it */
1374 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1375 		return ecap_flts(iommu->ecap);
1376 
1377 	return true;
1378 }
1379 
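/*
 * Allocate a domain ID on @iommu for @domain (or take a reference if one
 * is already allocated) and record it in the domain's iommu_array.
 */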
1380 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1381 {
1382 	struct iommu_domain_info *info, *curr;
1383 	unsigned long ndomains;
1384 	int num, ret = -ENOSPC;
1385 
1386 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1387 		return 0;
1388 
1389 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1390 	if (!info)
1391 		return -ENOMEM;
1392 
1393 	spin_lock(&iommu->lock);
1394 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1395 	if (curr) {
1396 		curr->refcnt++;
1397 		spin_unlock(&iommu->lock);
1398 		kfree(info);
1399 		return 0;
1400 	}
1401 
1402 	ndomains = cap_ndoms(iommu->cap);
1403 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1404 	if (num >= ndomains) {
1405 		pr_err("%s: No free domain ids\n", iommu->name);
1406 		goto err_unlock;
1407 	}
1408 
1409 	set_bit(num, iommu->domain_ids);
1410 	info->refcnt	= 1;
1411 	info->did	= num;
1412 	info->iommu	= iommu;
1413 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1414 			  NULL, info, GFP_ATOMIC);
1415 	if (curr) {
1416 		ret = xa_err(curr) ? : -EBUSY;
1417 		goto err_clear;
1418 	}
1419 
1420 	spin_unlock(&iommu->lock);
1421 	return 0;
1422 
1423 err_clear:
1424 	clear_bit(info->did, iommu->domain_ids);
1425 err_unlock:
1426 	spin_unlock(&iommu->lock);
1427 	kfree(info);
1428 	return ret;
1429 }
1430 
1431 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1432 {
1433 	struct iommu_domain_info *info;
1434 
1435 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1436 		return;
1437 
1438 	spin_lock(&iommu->lock);
1439 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1440 	if (--info->refcnt == 0) {
1441 		clear_bit(info->did, iommu->domain_ids);
1442 		xa_erase(&domain->iommu_array, iommu->seq_id);
1443 		domain->nid = NUMA_NO_NODE;
1444 		kfree(info);
1445 	}
1446 	spin_unlock(&iommu->lock);
1447 }
1448 
1449 static void domain_exit(struct dmar_domain *domain)
1450 {
1451 	if (domain->pgd) {
1452 		LIST_HEAD(freelist);
1453 
1454 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1455 		iommu_put_pages_list(&freelist);
1456 	}
1457 
1458 	if (WARN_ON(!list_empty(&domain->devices)))
1459 		return;
1460 
1461 	kfree(domain->qi_batch);
1462 	kfree(domain);
1463 }
1464 
1465 /*
1466  * For kdump cases, old valid entries may be cached due to the
1467  * in-flight DMA and copied pgtable, but there is no unmapping
1468  * behaviour for them, thus we need an explicit cache flush for
1469  * the newly-mapped device. For kdump, at this point, the device
1470  * is supposed to finish reset at its driver probe stage, so no
1471  * in-flight DMA will exist, and we don't need to worry about it
1472  * hereafter.
1473  */
1474 static void copied_context_tear_down(struct intel_iommu *iommu,
1475 				     struct context_entry *context,
1476 				     u8 bus, u8 devfn)
1477 {
1478 	u16 did_old;
1479 
1480 	if (!context_copied(iommu, bus, devfn))
1481 		return;
1482 
1483 	assert_spin_locked(&iommu->lock);
1484 
1485 	did_old = context_domain_id(context);
1486 	context_clear_entry(context);
1487 
1488 	if (did_old < cap_ndoms(iommu->cap)) {
1489 		iommu->flush.flush_context(iommu, did_old,
1490 					   PCI_DEVID(bus, devfn),
1491 					   DMA_CCMD_MASK_NOBIT,
1492 					   DMA_CCMD_DEVICE_INVL);
1493 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1494 					 DMA_TLB_DSI_FLUSH);
1495 	}
1496 
1497 	clear_context_copied(iommu, bus, devfn);
1498 }
1499 
1500 /*
1501  * It's a non-present to present mapping. If hardware doesn't cache
1502  * non-present entry we only need to flush the write-buffer. If the
1503  * _does_ cache non-present entries, then it does so in the special
1504  * domain #0, which we have to flush:
1505  */
1506 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1507 					u8 bus, u8 devfn)
1508 {
1509 	if (cap_caching_mode(iommu->cap)) {
1510 		iommu->flush.flush_context(iommu, 0,
1511 					   PCI_DEVID(bus, devfn),
1512 					   DMA_CCMD_MASK_NOBIT,
1513 					   DMA_CCMD_DEVICE_INVL);
1514 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1515 	} else {
1516 		iommu_flush_write_buffer(iommu);
1517 	}
1518 }
1519 
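/*
 * Install a context entry that maps @bus/@devfn on @iommu to this
 * domain's page table, then flush the context and IOTLB caches as
 * needed.
 */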
1520 static int domain_context_mapping_one(struct dmar_domain *domain,
1521 				      struct intel_iommu *iommu,
1522 				      u8 bus, u8 devfn)
1523 {
1524 	struct device_domain_info *info =
1525 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1526 	u16 did = domain_id_iommu(domain, iommu);
1527 	int translation = CONTEXT_TT_MULTI_LEVEL;
1528 	struct dma_pte *pgd = domain->pgd;
1529 	struct context_entry *context;
1530 	int ret;
1531 
1532 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1533 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1534 
1535 	spin_lock(&iommu->lock);
1536 	ret = -ENOMEM;
1537 	context = iommu_context_addr(iommu, bus, devfn, 1);
1538 	if (!context)
1539 		goto out_unlock;
1540 
1541 	ret = 0;
1542 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1543 		goto out_unlock;
1544 
1545 	copied_context_tear_down(iommu, context, bus, devfn);
1546 	context_clear_entry(context);
1547 	context_set_domain_id(context, did);
1548 
1549 	if (info && info->ats_supported)
1550 		translation = CONTEXT_TT_DEV_IOTLB;
1551 	else
1552 		translation = CONTEXT_TT_MULTI_LEVEL;
1553 
1554 	context_set_address_root(context, virt_to_phys(pgd));
1555 	context_set_address_width(context, domain->agaw);
1556 	context_set_translation_type(context, translation);
1557 	context_set_fault_enable(context);
1558 	context_set_present(context);
1559 	if (!ecap_coherent(iommu->ecap))
1560 		clflush_cache_range(context, sizeof(*context));
1561 	context_present_cache_flush(iommu, did, bus, devfn);
1562 	ret = 0;
1563 
1564 out_unlock:
1565 	spin_unlock(&iommu->lock);
1566 
1567 	return ret;
1568 }
1569 
1570 static int domain_context_mapping_cb(struct pci_dev *pdev,
1571 				     u16 alias, void *opaque)
1572 {
1573 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1574 	struct intel_iommu *iommu = info->iommu;
1575 	struct dmar_domain *domain = opaque;
1576 
1577 	return domain_context_mapping_one(domain, iommu,
1578 					  PCI_BUS_NUM(alias), alias & 0xff);
1579 }
1580 
1581 static int
1582 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1583 {
1584 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1585 	struct intel_iommu *iommu = info->iommu;
1586 	u8 bus = info->bus, devfn = info->devfn;
1587 	int ret;
1588 
1589 	if (!dev_is_pci(dev))
1590 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1591 
1592 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1593 				     domain_context_mapping_cb, domain);
1594 	if (ret)
1595 		return ret;
1596 
1597 	iommu_enable_pci_ats(info);
1598 
1599 	return 0;
1600 }
1601 
1602 /* Return largest possible superpage level for a given mapping */
1603 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1604 				   unsigned long phy_pfn, unsigned long pages)
1605 {
1606 	int support, level = 1;
1607 	unsigned long pfnmerge;
1608 
1609 	support = domain->iommu_superpage;
1610 
1611 	/* To use a large page, the virtual *and* physical addresses
1612 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1613 	   of them will mean we have to use smaller pages. So just
1614 	   merge them and check both at once. */
1615 	pfnmerge = iov_pfn | phy_pfn;
1616 
1617 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1618 		pages >>= VTD_STRIDE_SHIFT;
1619 		if (!pages)
1620 			break;
1621 		pfnmerge >>= VTD_STRIDE_SHIFT;
1622 		level++;
1623 		support--;
1624 	}
1625 	return level;
1626 }
1627 
1628 /*
1629  * Ensure that old small page tables are removed to make room for superpage(s).
1630  * We're going to add new large pages, so make sure we don't remove their parent
1631  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1632  */
1633 static void switch_to_super_page(struct dmar_domain *domain,
1634 				 unsigned long start_pfn,
1635 				 unsigned long end_pfn, int level)
1636 {
1637 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1638 	struct dma_pte *pte = NULL;
1639 
1640 	while (start_pfn <= end_pfn) {
1641 		if (!pte)
1642 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1643 					     GFP_ATOMIC);
1644 
1645 		if (dma_pte_present(pte)) {
1646 			dma_pte_free_pagetable(domain, start_pfn,
1647 					       start_pfn + lvl_pages - 1,
1648 					       level + 1);
1649 
1650 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1651 					      end_pfn << VTD_PAGE_SHIFT, 0);
1652 		}
1653 
1654 		pte++;
1655 		start_pfn += lvl_pages;
1656 		if (first_pte_in_page(pte))
1657 			pte = NULL;
1658 	}
1659 }
1660 
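/*
 * Map @nr_pages contiguous pages starting at @phys_pfn to IOVA @iov_pfn,
 * using superpages where alignment and hardware support allow.
 */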
1661 static int
1662 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1663 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1664 		 gfp_t gfp)
1665 {
1666 	struct dma_pte *first_pte = NULL, *pte = NULL;
1667 	unsigned int largepage_lvl = 0;
1668 	unsigned long lvl_pages = 0;
1669 	phys_addr_t pteval;
1670 	u64 attr;
1671 
1672 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1673 		return -EINVAL;
1674 
1675 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1676 		return -EINVAL;
1677 
1678 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1679 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1680 		return -EINVAL;
1681 	}
1682 
1683 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1684 	attr |= DMA_FL_PTE_PRESENT;
1685 	if (domain->use_first_level) {
1686 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1687 		if (prot & DMA_PTE_WRITE)
1688 			attr |= DMA_FL_PTE_DIRTY;
1689 	}
1690 
1691 	domain->has_mappings = true;
1692 
1693 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1694 
1695 	while (nr_pages > 0) {
1696 		uint64_t tmp;
1697 
1698 		if (!pte) {
1699 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1700 					phys_pfn, nr_pages);
1701 
1702 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1703 					     gfp);
1704 			if (!pte)
1705 				return -ENOMEM;
1706 			first_pte = pte;
1707 
1708 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1709 
1710 			/* It is a large page */
1711 			if (largepage_lvl > 1) {
1712 				unsigned long end_pfn;
1713 				unsigned long pages_to_remove;
1714 
1715 				pteval |= DMA_PTE_LARGE_PAGE;
1716 				pages_to_remove = min_t(unsigned long, nr_pages,
1717 							nr_pte_to_next_page(pte) * lvl_pages);
1718 				end_pfn = iov_pfn + pages_to_remove - 1;
1719 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1720 			} else {
1721 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1722 			}
1723 
1724 		}
1725 		/* We don't need a lock here; nobody else
1726 		 * touches this IOVA range.
1727 		 */
1728 		tmp = 0ULL;
1729 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1730 			static int dumps = 5;
1731 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1732 				iov_pfn, tmp, (unsigned long long)pteval);
1733 			if (dumps) {
1734 				dumps--;
1735 				debug_dma_dump_mappings(NULL);
1736 			}
1737 			WARN_ON(1);
1738 		}
1739 
1740 		nr_pages -= lvl_pages;
1741 		iov_pfn += lvl_pages;
1742 		phys_pfn += lvl_pages;
1743 		pteval += lvl_pages * VTD_PAGE_SIZE;
1744 
1745 		/* If the next PTE would be the first in a new page, then we
1746 		 * need to flush the cache on the entries we've just written.
1747 		 * And then we'll need to recalculate 'pte', so clear it and
1748 		 * let it get set again in the if (!pte) block above.
1749 		 *
1750 		 * If we're done (!nr_pages) we need to flush the cache too.
1751 		 *
1752 		 * Also if we've been setting superpages, we may need to
1753 		 * recalculate 'pte' and switch back to smaller pages for the
1754 		 * end of the mapping, if the trailing size is not enough to
1755 		 * use another superpage (i.e. nr_pages < lvl_pages).
1756 		 */
1757 		pte++;
1758 		if (!nr_pages || first_pte_in_page(pte) ||
1759 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1760 			domain_flush_cache(domain, first_pte,
1761 					   (void *)pte - (void *)first_pte);
1762 			pte = NULL;
1763 		}
1764 	}
1765 
1766 	return 0;
1767 }
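
/*
 * Hedged usage sketch; this mirrors what intel_iommu_map() further down
 * already does and is not an extra code path.  Mapping one 2MiB-aligned
 * 2MiB range (iova and paddr both 2MiB aligned is an assumption here):
 *
 *	ret = __domain_mapping(domain, iova >> VTD_PAGE_SHIFT,
 *			       paddr >> VTD_PAGE_SHIFT,
 *			       SZ_2M >> VTD_PAGE_SHIFT,
 *			       DMA_PTE_READ | DMA_PTE_WRITE, GFP_KERNEL);
 *
 * With both pfns aligned, hardware_largepage_caps() reports level 2 (if
 * the domain supports superpages) and a single large PTE is written.
 */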
1768 
domain_context_clear_one(struct device_domain_info * info,u8 bus,u8 devfn)1769 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1770 {
1771 	struct intel_iommu *iommu = info->iommu;
1772 	struct context_entry *context;
1773 	u16 did;
1774 
1775 	spin_lock(&iommu->lock);
1776 	context = iommu_context_addr(iommu, bus, devfn, 0);
1777 	if (!context) {
1778 		spin_unlock(&iommu->lock);
1779 		return;
1780 	}
1781 
1782 	did = context_domain_id(context);
1783 	context_clear_entry(context);
1784 	__iommu_flush_cache(iommu, context, sizeof(*context));
1785 	spin_unlock(&iommu->lock);
1786 	intel_context_flush_no_pasid(info, context, did);
1787 }
1788 
__domain_setup_first_level(struct intel_iommu * iommu,struct device * dev,ioasid_t pasid,u16 did,pgd_t * pgd,int flags,struct iommu_domain * old)1789 int __domain_setup_first_level(struct intel_iommu *iommu,
1790 			       struct device *dev, ioasid_t pasid,
1791 			       u16 did, pgd_t *pgd, int flags,
1792 			       struct iommu_domain *old)
1793 {
1794 	if (!old)
1795 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1796 						     pasid, did, flags);
1797 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1798 					       iommu_domain_did(old, iommu),
1799 					       flags);
1800 }
1801 
domain_setup_second_level(struct intel_iommu * iommu,struct dmar_domain * domain,struct device * dev,ioasid_t pasid,struct iommu_domain * old)1802 static int domain_setup_second_level(struct intel_iommu *iommu,
1803 				     struct dmar_domain *domain,
1804 				     struct device *dev, ioasid_t pasid,
1805 				     struct iommu_domain *old)
1806 {
1807 	if (!old)
1808 		return intel_pasid_setup_second_level(iommu, domain,
1809 						      dev, pasid);
1810 	return intel_pasid_replace_second_level(iommu, domain, dev,
1811 						iommu_domain_did(old, iommu),
1812 						pasid);
1813 }
1814 
domain_setup_passthrough(struct intel_iommu * iommu,struct device * dev,ioasid_t pasid,struct iommu_domain * old)1815 static int domain_setup_passthrough(struct intel_iommu *iommu,
1816 				    struct device *dev, ioasid_t pasid,
1817 				    struct iommu_domain *old)
1818 {
1819 	if (!old)
1820 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1821 	return intel_pasid_replace_pass_through(iommu, dev,
1822 						iommu_domain_did(old, iommu),
1823 						pasid);
1824 }
1825 
domain_setup_first_level(struct intel_iommu * iommu,struct dmar_domain * domain,struct device * dev,u32 pasid,struct iommu_domain * old)1826 static int domain_setup_first_level(struct intel_iommu *iommu,
1827 				    struct dmar_domain *domain,
1828 				    struct device *dev,
1829 				    u32 pasid, struct iommu_domain *old)
1830 {
1831 	struct dma_pte *pgd = domain->pgd;
1832 	int level, flags = 0;
1833 
1834 	level = agaw_to_level(domain->agaw);
1835 	if (level != 4 && level != 5)
1836 		return -EINVAL;
1837 
1838 	if (level == 5)
1839 		flags |= PASID_FLAG_FL5LP;
1840 
1841 	if (domain->force_snooping)
1842 		flags |= PASID_FLAG_PAGE_SNOOP;
1843 
1844 	return __domain_setup_first_level(iommu, dev, pasid,
1845 					  domain_id_iommu(domain, iommu),
1846 					  (pgd_t *)pgd, flags, old);
1847 }
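
/*
 * Illustrative expansion, no new behaviour: for a 4-level first-stage
 * domain with force_snooping set, the call above boils down to
 *
 *	__domain_setup_first_level(iommu, dev, pasid,
 *				   domain_id_iommu(domain, iommu),
 *				   (pgd_t *)domain->pgd,
 *				   PASID_FLAG_PAGE_SNOOP, old);
 *
 * A 5-level table additionally ORs in PASID_FLAG_FL5LP; any other level
 * is rejected with -EINVAL before the PASID entry is touched.
 */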
1848 
dmar_domain_attach_device(struct dmar_domain * domain,struct device * dev)1849 static int dmar_domain_attach_device(struct dmar_domain *domain,
1850 				     struct device *dev)
1851 {
1852 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1853 	struct intel_iommu *iommu = info->iommu;
1854 	unsigned long flags;
1855 	int ret;
1856 
1857 	ret = domain_attach_iommu(domain, iommu);
1858 	if (ret)
1859 		return ret;
1860 
1861 	info->domain = domain;
1862 	spin_lock_irqsave(&domain->lock, flags);
1863 	list_add(&info->link, &domain->devices);
1864 	spin_unlock_irqrestore(&domain->lock, flags);
1865 
1866 	if (dev_is_real_dma_subdevice(dev))
1867 		return 0;
1868 
1869 	if (!sm_supported(iommu))
1870 		ret = domain_context_mapping(domain, dev);
1871 	else if (domain->use_first_level)
1872 		ret = domain_setup_first_level(iommu, domain, dev,
1873 					       IOMMU_NO_PASID, NULL);
1874 	else
1875 		ret = domain_setup_second_level(iommu, domain, dev,
1876 						IOMMU_NO_PASID, NULL);
1877 
1878 	if (ret)
1879 		goto out_block_translation;
1880 
1881 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1882 	if (ret)
1883 		goto out_block_translation;
1884 
1885 	return 0;
1886 
1887 out_block_translation:
1888 	device_block_translation(dev);
1889 	return ret;
1890 }
1891 
1892 /**
1893  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1894  * is relaxable (ie. is allowed to be not enforced under some conditions)
1895  * @dev: device handle
1896  *
1897  * We assume that PCI USB devices with RMRRs have them largely
1898  * for historical reasons and that the RMRR space is not actively used post
1899  * boot.  This exclusion may change if vendors begin to abuse it.
1900  *
1901  * The same exception is made for graphics devices, with the requirement that
1902  * any use of the RMRR regions will be torn down before assigning the device
1903  * to a guest.
1904  *
1905  * Return: true if the RMRR is relaxable, false otherwise
1906  */
device_rmrr_is_relaxable(struct device * dev)1907 static bool device_rmrr_is_relaxable(struct device *dev)
1908 {
1909 	struct pci_dev *pdev;
1910 
1911 	if (!dev_is_pci(dev))
1912 		return false;
1913 
1914 	pdev = to_pci_dev(dev);
1915 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1916 		return true;
1917 	else
1918 		return false;
1919 }
1920 
device_def_domain_type(struct device * dev)1921 static int device_def_domain_type(struct device *dev)
1922 {
1923 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1924 	struct intel_iommu *iommu = info->iommu;
1925 
1926 	/*
1927 	 * Hardware does not support the passthrough translation mode.
1928 	 * Always use a dynamic mapping domain.
1929 	 */
1930 	if (!ecap_pass_through(iommu->ecap))
1931 		return IOMMU_DOMAIN_DMA;
1932 
1933 	if (dev_is_pci(dev)) {
1934 		struct pci_dev *pdev = to_pci_dev(dev);
1935 
1936 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1937 			return IOMMU_DOMAIN_IDENTITY;
1938 	}
1939 
1940 	return 0;
1941 }
1942 
intel_iommu_init_qi(struct intel_iommu * iommu)1943 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1944 {
1945 	/*
1946 	 * Start from a sane IOMMU hardware state.
1947 	 * If queued invalidation was already initialized by us
1948 	 * (for example, while enabling interrupt remapping), then
1949 	 * things are already rolling from a sane state.
1950 	 */
1951 	if (!iommu->qi) {
1952 		/*
1953 		 * Clear any previous faults.
1954 		 */
1955 		dmar_fault(-1, iommu);
1956 		/*
1957 		 * Disable queued invalidation if supported and already enabled
1958 		 * before OS handover.
1959 		 */
1960 		dmar_disable_qi(iommu);
1961 	}
1962 
1963 	if (dmar_enable_qi(iommu)) {
1964 		/*
1965 		 * Queued invalidation is not enabled; use register-based invalidation.
1966 		 */
1967 		iommu->flush.flush_context = __iommu_flush_context;
1968 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1969 		pr_info("%s: Using Register based invalidation\n",
1970 			iommu->name);
1971 	} else {
1972 		iommu->flush.flush_context = qi_flush_context;
1973 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1974 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1975 	}
1976 }
1977 
copy_context_table(struct intel_iommu * iommu,struct root_entry * old_re,struct context_entry ** tbl,int bus,bool ext)1978 static int copy_context_table(struct intel_iommu *iommu,
1979 			      struct root_entry *old_re,
1980 			      struct context_entry **tbl,
1981 			      int bus, bool ext)
1982 {
1983 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1984 	struct context_entry *new_ce = NULL, ce;
1985 	struct context_entry *old_ce = NULL;
1986 	struct root_entry re;
1987 	phys_addr_t old_ce_phys;
1988 
1989 	tbl_idx = ext ? bus * 2 : bus;
1990 	memcpy(&re, old_re, sizeof(re));
1991 
1992 	for (devfn = 0; devfn < 256; devfn++) {
1993 		/* First calculate the correct index */
1994 		idx = (ext ? devfn * 2 : devfn) % 256;
1995 
1996 		if (idx == 0) {
1997 			/* First save what we may have and clean up */
1998 			if (new_ce) {
1999 				tbl[tbl_idx] = new_ce;
2000 				__iommu_flush_cache(iommu, new_ce,
2001 						    VTD_PAGE_SIZE);
2002 				pos = 1;
2003 			}
2004 
2005 			if (old_ce)
2006 				memunmap(old_ce);
2007 
2008 			ret = 0;
2009 			if (devfn < 0x80)
2010 				old_ce_phys = root_entry_lctp(&re);
2011 			else
2012 				old_ce_phys = root_entry_uctp(&re);
2013 
2014 			if (!old_ce_phys) {
2015 				if (ext && devfn == 0) {
2016 					/* No LCTP, try UCTP */
2017 					devfn = 0x7f;
2018 					continue;
2019 				} else {
2020 					goto out;
2021 				}
2022 			}
2023 
2024 			ret = -ENOMEM;
2025 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2026 					MEMREMAP_WB);
2027 			if (!old_ce)
2028 				goto out;
2029 
2030 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2031 			if (!new_ce)
2032 				goto out_unmap;
2033 
2034 			ret = 0;
2035 		}
2036 
2037 		/* Now copy the context entry */
2038 		memcpy(&ce, old_ce + idx, sizeof(ce));
2039 
2040 		if (!context_present(&ce))
2041 			continue;
2042 
2043 		did = context_domain_id(&ce);
2044 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2045 			set_bit(did, iommu->domain_ids);
2046 
2047 		set_context_copied(iommu, bus, devfn);
2048 		new_ce[idx] = ce;
2049 	}
2050 
2051 	tbl[tbl_idx + pos] = new_ce;
2052 
2053 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2054 
2055 out_unmap:
2056 	memunmap(old_ce);
2057 
2058 out:
2059 	return ret;
2060 }
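
/*
 * Worked index example (sketch only, same logic as above): with a
 * scalable-mode root table (ext == true) each bus uses two context
 * tables, so for bus 3:
 *
 *	devfn 0x00-0x7f -> tbl[6]	// lower table, root_entry_lctp()
 *	devfn 0x80-0xff -> tbl[7]	// upper table, root_entry_uctp()
 *
 * In legacy mode there is a single 256-entry table per bus at tbl[bus].
 */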
2061 
copy_translation_tables(struct intel_iommu * iommu)2062 static int copy_translation_tables(struct intel_iommu *iommu)
2063 {
2064 	struct context_entry **ctxt_tbls;
2065 	struct root_entry *old_rt;
2066 	phys_addr_t old_rt_phys;
2067 	int ctxt_table_entries;
2068 	u64 rtaddr_reg;
2069 	int bus, ret;
2070 	bool new_ext, ext;
2071 
2072 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2073 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2074 	new_ext    = !!sm_supported(iommu);
2075 
2076 	/*
2077 	 * The RTT bit can only be changed when translation is disabled,
2078 	 * but disabling translation would open a window for data
2079 	 * corruption. So bail out and don't copy anything if we would
2080 	 * have to change the bit.
2081 	 */
2082 	if (new_ext != ext)
2083 		return -EINVAL;
2084 
2085 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2086 	if (!iommu->copied_tables)
2087 		return -ENOMEM;
2088 
2089 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2090 	if (!old_rt_phys)
2091 		return -EINVAL;
2092 
2093 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2094 	if (!old_rt)
2095 		return -ENOMEM;
2096 
2097 	/* This is too big for the stack - allocate it from slab */
2098 	ctxt_table_entries = ext ? 512 : 256;
2099 	ret = -ENOMEM;
2100 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2101 	if (!ctxt_tbls)
2102 		goto out_unmap;
2103 
2104 	for (bus = 0; bus < 256; bus++) {
2105 		ret = copy_context_table(iommu, &old_rt[bus],
2106 					 ctxt_tbls, bus, ext);
2107 		if (ret) {
2108 			pr_err("%s: Failed to copy context table for bus %d\n",
2109 				iommu->name, bus);
2110 			continue;
2111 		}
2112 	}
2113 
2114 	spin_lock(&iommu->lock);
2115 
2116 	/* Context tables are copied, now write them to the root_entry table */
2117 	for (bus = 0; bus < 256; bus++) {
2118 		int idx = ext ? bus * 2 : bus;
2119 		u64 val;
2120 
2121 		if (ctxt_tbls[idx]) {
2122 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2123 			iommu->root_entry[bus].lo = val;
2124 		}
2125 
2126 		if (!ext || !ctxt_tbls[idx + 1])
2127 			continue;
2128 
2129 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2130 		iommu->root_entry[bus].hi = val;
2131 	}
2132 
2133 	spin_unlock(&iommu->lock);
2134 
2135 	kfree(ctxt_tbls);
2136 
2137 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2138 
2139 	ret = 0;
2140 
2141 out_unmap:
2142 	memunmap(old_rt);
2143 
2144 	return ret;
2145 }
2146 
init_dmars(void)2147 static int __init init_dmars(void)
2148 {
2149 	struct dmar_drhd_unit *drhd;
2150 	struct intel_iommu *iommu;
2151 	int ret;
2152 
2153 	for_each_iommu(iommu, drhd) {
2154 		if (drhd->ignored) {
2155 			iommu_disable_translation(iommu);
2156 			continue;
2157 		}
2158 
2159 		/*
2160 		 * Find the max PASID size of all IOMMUs in the system.
2161 		 * We need to ensure the system PASID table is no bigger
2162 		 * than the smallest supported size.
2163 		 */
2164 		if (pasid_supported(iommu)) {
2165 			u32 temp = 2 << ecap_pss(iommu->ecap);
2166 
2167 			intel_pasid_max_id = min_t(u32, temp,
2168 						   intel_pasid_max_id);
2169 		}
2170 
2171 		intel_iommu_init_qi(iommu);
2172 
2173 		ret = iommu_init_domains(iommu);
2174 		if (ret)
2175 			goto free_iommu;
2176 
2177 		init_translation_status(iommu);
2178 
2179 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2180 			iommu_disable_translation(iommu);
2181 			clear_translation_pre_enabled(iommu);
2182 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2183 				iommu->name);
2184 		}
2185 
2186 		/*
2187 		 * TBD:
2188 		 * we could share the same root & context tables
2189 		 * among all IOMMUs. Needs to be split out later.
2190 		 */
2191 		ret = iommu_alloc_root_entry(iommu);
2192 		if (ret)
2193 			goto free_iommu;
2194 
2195 		if (translation_pre_enabled(iommu)) {
2196 			pr_info("Translation already enabled - trying to copy translation structures\n");
2197 
2198 			ret = copy_translation_tables(iommu);
2199 			if (ret) {
2200 				/*
2201 				 * We found the IOMMU with translation
2202 				 * enabled - but failed to copy over the
2203 				 * old root-entry table. Try to proceed
2204 				 * by disabling translation now and
2205 				 * allocating a clean root-entry table.
2206 				 * This might cause DMAR faults, but
2207 				 * probably the dump will still succeed.
2208 				 */
2209 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2210 				       iommu->name);
2211 				iommu_disable_translation(iommu);
2212 				clear_translation_pre_enabled(iommu);
2213 			} else {
2214 				pr_info("Copied translation tables from previous kernel for %s\n",
2215 					iommu->name);
2216 			}
2217 		}
2218 
2219 		intel_svm_check(iommu);
2220 	}
2221 
2222 	/*
2223 	 * Now that qi is enabled on all iommus, set the root entry and flush
2224 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2225 	 * flush_context function will loop forever and the boot hangs.
2226 	 */
2227 	for_each_active_iommu(iommu, drhd) {
2228 		iommu_flush_write_buffer(iommu);
2229 		iommu_set_root_entry(iommu);
2230 	}
2231 
2232 	check_tylersburg_isoch();
2233 
2234 	/*
2235 	 * for each drhd
2236 	 *   enable fault log
2237 	 *   global invalidate context cache
2238 	 *   global invalidate iotlb
2239 	 *   enable translation
2240 	 */
2241 	for_each_iommu(iommu, drhd) {
2242 		if (drhd->ignored) {
2243 			/*
2244 			 * we always have to disable PMRs or DMA may fail on
2245 			 * this device
2246 			 */
2247 			if (force_on)
2248 				iommu_disable_protect_mem_regions(iommu);
2249 			continue;
2250 		}
2251 
2252 		iommu_flush_write_buffer(iommu);
2253 
2254 		if (ecap_prs(iommu->ecap)) {
2255 			/*
2256 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2257 			 * could cause a lock race condition.
2258 			 */
2259 			up_write(&dmar_global_lock);
2260 			ret = intel_iommu_enable_prq(iommu);
2261 			down_write(&dmar_global_lock);
2262 			if (ret)
2263 				goto free_iommu;
2264 		}
2265 
2266 		ret = dmar_set_interrupt(iommu);
2267 		if (ret)
2268 			goto free_iommu;
2269 	}
2270 
2271 	return 0;
2272 
2273 free_iommu:
2274 	for_each_active_iommu(iommu, drhd) {
2275 		disable_dmar_iommu(iommu);
2276 		free_dmar_iommu(iommu);
2277 	}
2278 
2279 	return ret;
2280 }
2281 
init_no_remapping_devices(void)2282 static void __init init_no_remapping_devices(void)
2283 {
2284 	struct dmar_drhd_unit *drhd;
2285 	struct device *dev;
2286 	int i;
2287 
2288 	for_each_drhd_unit(drhd) {
2289 		if (!drhd->include_all) {
2290 			for_each_active_dev_scope(drhd->devices,
2291 						  drhd->devices_cnt, i, dev)
2292 				break;
2293 			/* ignore DMAR unit if no devices exist */
2294 			if (i == drhd->devices_cnt)
2295 				drhd->ignored = 1;
2296 		}
2297 	}
2298 
2299 	for_each_active_drhd_unit(drhd) {
2300 		if (drhd->include_all)
2301 			continue;
2302 
2303 		for_each_active_dev_scope(drhd->devices,
2304 					  drhd->devices_cnt, i, dev)
2305 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2306 				break;
2307 		if (i < drhd->devices_cnt)
2308 			continue;
2309 
2310 		/* This IOMMU has *only* gfx devices. Either bypass it or
2311 		   set the gfx_dedicated flag, as appropriate */
2312 		drhd->gfx_dedicated = 1;
2313 		if (disable_igfx_iommu)
2314 			drhd->ignored = 1;
2315 	}
2316 }
2317 
2318 #ifdef CONFIG_SUSPEND
init_iommu_hw(void)2319 static int init_iommu_hw(void)
2320 {
2321 	struct dmar_drhd_unit *drhd;
2322 	struct intel_iommu *iommu = NULL;
2323 	int ret;
2324 
2325 	for_each_active_iommu(iommu, drhd) {
2326 		if (iommu->qi) {
2327 			ret = dmar_reenable_qi(iommu);
2328 			if (ret)
2329 				return ret;
2330 		}
2331 	}
2332 
2333 	for_each_iommu(iommu, drhd) {
2334 		if (drhd->ignored) {
2335 			/*
2336 			 * we always have to disable PMRs or DMA may fail on
2337 			 * this device
2338 			 */
2339 			if (force_on)
2340 				iommu_disable_protect_mem_regions(iommu);
2341 			continue;
2342 		}
2343 
2344 		iommu_flush_write_buffer(iommu);
2345 		iommu_set_root_entry(iommu);
2346 		iommu_enable_translation(iommu);
2347 		iommu_disable_protect_mem_regions(iommu);
2348 	}
2349 
2350 	return 0;
2351 }
2352 
iommu_flush_all(void)2353 static void iommu_flush_all(void)
2354 {
2355 	struct dmar_drhd_unit *drhd;
2356 	struct intel_iommu *iommu;
2357 
2358 	for_each_active_iommu(iommu, drhd) {
2359 		iommu->flush.flush_context(iommu, 0, 0, 0,
2360 					   DMA_CCMD_GLOBAL_INVL);
2361 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2362 					 DMA_TLB_GLOBAL_FLUSH);
2363 	}
2364 }
2365 
iommu_suspend(void)2366 static int iommu_suspend(void)
2367 {
2368 	struct dmar_drhd_unit *drhd;
2369 	struct intel_iommu *iommu = NULL;
2370 	unsigned long flag;
2371 
2372 	iommu_flush_all();
2373 
2374 	for_each_active_iommu(iommu, drhd) {
2375 		iommu_disable_translation(iommu);
2376 
2377 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2378 
2379 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2380 			readl(iommu->reg + DMAR_FECTL_REG);
2381 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2382 			readl(iommu->reg + DMAR_FEDATA_REG);
2383 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2384 			readl(iommu->reg + DMAR_FEADDR_REG);
2385 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2386 			readl(iommu->reg + DMAR_FEUADDR_REG);
2387 
2388 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2389 	}
2390 	return 0;
2391 }
2392 
iommu_resume(void)2393 static void iommu_resume(void)
2394 {
2395 	struct dmar_drhd_unit *drhd;
2396 	struct intel_iommu *iommu = NULL;
2397 	unsigned long flag;
2398 
2399 	if (init_iommu_hw()) {
2400 		if (force_on)
2401 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2402 		else
2403 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2404 		return;
2405 	}
2406 
2407 	for_each_active_iommu(iommu, drhd) {
2408 
2409 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2410 
2411 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2412 			iommu->reg + DMAR_FECTL_REG);
2413 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2414 			iommu->reg + DMAR_FEDATA_REG);
2415 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2416 			iommu->reg + DMAR_FEADDR_REG);
2417 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2418 			iommu->reg + DMAR_FEUADDR_REG);
2419 
2420 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2421 	}
2422 }
2423 
2424 static struct syscore_ops iommu_syscore_ops = {
2425 	.resume		= iommu_resume,
2426 	.suspend	= iommu_suspend,
2427 };
2428 
init_iommu_pm_ops(void)2429 static void __init init_iommu_pm_ops(void)
2430 {
2431 	register_syscore_ops(&iommu_syscore_ops);
2432 }
2433 
2434 #else
init_iommu_pm_ops(void)2435 static inline void init_iommu_pm_ops(void) {}
2436 #endif	/* CONFIG_SUSPEND */
2437 
rmrr_sanity_check(struct acpi_dmar_reserved_memory * rmrr)2438 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2439 {
2440 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2441 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2442 	    rmrr->end_address <= rmrr->base_address ||
2443 	    arch_rmrr_sanity_check(rmrr))
2444 		return -EINVAL;
2445 
2446 	return 0;
2447 }
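
/*
 * Worked example with illustrative values only: an RMRR of
 * [0x000dd000, 0x000dffff] passes the check above, since both the base
 * and end + 1 (0xe0000) are PAGE_SIZE aligned and end > base, whereas
 * [0x000dd000, 0x000dd7ff] is rejected because end + 1 is not page
 * aligned.  arch_rmrr_sanity_check() may still veto an otherwise
 * well-formed range.
 */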
2448 
dmar_parse_one_rmrr(struct acpi_dmar_header * header,void * arg)2449 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2450 {
2451 	struct acpi_dmar_reserved_memory *rmrr;
2452 	struct dmar_rmrr_unit *rmrru;
2453 
2454 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2455 	if (rmrr_sanity_check(rmrr)) {
2456 		pr_warn(FW_BUG
2457 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2458 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2459 			   rmrr->base_address, rmrr->end_address,
2460 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2461 			   dmi_get_system_info(DMI_BIOS_VERSION),
2462 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2463 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2464 	}
2465 
2466 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2467 	if (!rmrru)
2468 		goto out;
2469 
2470 	rmrru->hdr = header;
2471 
2472 	rmrru->base_address = rmrr->base_address;
2473 	rmrru->end_address = rmrr->end_address;
2474 
2475 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2476 				((void *)rmrr) + rmrr->header.length,
2477 				&rmrru->devices_cnt);
2478 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2479 		goto free_rmrru;
2480 
2481 	list_add(&rmrru->list, &dmar_rmrr_units);
2482 
2483 	return 0;
2484 free_rmrru:
2485 	kfree(rmrru);
2486 out:
2487 	return -ENOMEM;
2488 }
2489 
dmar_find_atsr(struct acpi_dmar_atsr * atsr)2490 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2491 {
2492 	struct dmar_atsr_unit *atsru;
2493 	struct acpi_dmar_atsr *tmp;
2494 
2495 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2496 				dmar_rcu_check()) {
2497 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2498 		if (atsr->segment != tmp->segment)
2499 			continue;
2500 		if (atsr->header.length != tmp->header.length)
2501 			continue;
2502 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2503 			return atsru;
2504 	}
2505 
2506 	return NULL;
2507 }
2508 
dmar_parse_one_atsr(struct acpi_dmar_header * hdr,void * arg)2509 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2510 {
2511 	struct acpi_dmar_atsr *atsr;
2512 	struct dmar_atsr_unit *atsru;
2513 
2514 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2515 		return 0;
2516 
2517 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2518 	atsru = dmar_find_atsr(atsr);
2519 	if (atsru)
2520 		return 0;
2521 
2522 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2523 	if (!atsru)
2524 		return -ENOMEM;
2525 
2526 	/*
2527 	 * If the memory was allocated from the slab by an ACPI _DSM method,
2528 	 * we need to copy its content because the buffer will be freed
2529 	 * on return.
2530 	 */
2531 	atsru->hdr = (void *)(atsru + 1);
2532 	memcpy(atsru->hdr, hdr, hdr->length);
2533 	atsru->include_all = atsr->flags & 0x1;
2534 	if (!atsru->include_all) {
2535 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2536 				(void *)atsr + atsr->header.length,
2537 				&atsru->devices_cnt);
2538 		if (atsru->devices_cnt && atsru->devices == NULL) {
2539 			kfree(atsru);
2540 			return -ENOMEM;
2541 		}
2542 	}
2543 
2544 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2545 
2546 	return 0;
2547 }
2548 
intel_iommu_free_atsr(struct dmar_atsr_unit * atsru)2549 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2550 {
2551 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2552 	kfree(atsru);
2553 }
2554 
dmar_release_one_atsr(struct acpi_dmar_header * hdr,void * arg)2555 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2556 {
2557 	struct acpi_dmar_atsr *atsr;
2558 	struct dmar_atsr_unit *atsru;
2559 
2560 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2561 	atsru = dmar_find_atsr(atsr);
2562 	if (atsru) {
2563 		list_del_rcu(&atsru->list);
2564 		synchronize_rcu();
2565 		intel_iommu_free_atsr(atsru);
2566 	}
2567 
2568 	return 0;
2569 }
2570 
dmar_check_one_atsr(struct acpi_dmar_header * hdr,void * arg)2571 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2572 {
2573 	int i;
2574 	struct device *dev;
2575 	struct acpi_dmar_atsr *atsr;
2576 	struct dmar_atsr_unit *atsru;
2577 
2578 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2579 	atsru = dmar_find_atsr(atsr);
2580 	if (!atsru)
2581 		return 0;
2582 
2583 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2584 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2585 					  i, dev)
2586 			return -EBUSY;
2587 	}
2588 
2589 	return 0;
2590 }
2591 
dmar_find_satc(struct acpi_dmar_satc * satc)2592 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2593 {
2594 	struct dmar_satc_unit *satcu;
2595 	struct acpi_dmar_satc *tmp;
2596 
2597 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2598 				dmar_rcu_check()) {
2599 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2600 		if (satc->segment != tmp->segment)
2601 			continue;
2602 		if (satc->header.length != tmp->header.length)
2603 			continue;
2604 		if (memcmp(satc, tmp, satc->header.length) == 0)
2605 			return satcu;
2606 	}
2607 
2608 	return NULL;
2609 }
2610 
dmar_parse_one_satc(struct acpi_dmar_header * hdr,void * arg)2611 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2612 {
2613 	struct acpi_dmar_satc *satc;
2614 	struct dmar_satc_unit *satcu;
2615 
2616 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2617 		return 0;
2618 
2619 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2620 	satcu = dmar_find_satc(satc);
2621 	if (satcu)
2622 		return 0;
2623 
2624 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2625 	if (!satcu)
2626 		return -ENOMEM;
2627 
2628 	satcu->hdr = (void *)(satcu + 1);
2629 	memcpy(satcu->hdr, hdr, hdr->length);
2630 	satcu->atc_required = satc->flags & 0x1;
2631 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2632 					      (void *)satc + satc->header.length,
2633 					      &satcu->devices_cnt);
2634 	if (satcu->devices_cnt && !satcu->devices) {
2635 		kfree(satcu);
2636 		return -ENOMEM;
2637 	}
2638 	list_add_rcu(&satcu->list, &dmar_satc_units);
2639 
2640 	return 0;
2641 }
2642 
intel_iommu_add(struct dmar_drhd_unit * dmaru)2643 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2644 {
2645 	struct intel_iommu *iommu = dmaru->iommu;
2646 	int ret;
2647 
2648 	/*
2649 	 * Disable translation if already enabled prior to OS handover.
2650 	 */
2651 	if (iommu->gcmd & DMA_GCMD_TE)
2652 		iommu_disable_translation(iommu);
2653 
2654 	ret = iommu_init_domains(iommu);
2655 	if (ret == 0)
2656 		ret = iommu_alloc_root_entry(iommu);
2657 	if (ret)
2658 		goto out;
2659 
2660 	intel_svm_check(iommu);
2661 
2662 	if (dmaru->ignored) {
2663 		/*
2664 		 * we always have to disable PMRs or DMA may fail on this device
2665 		 */
2666 		if (force_on)
2667 			iommu_disable_protect_mem_regions(iommu);
2668 		return 0;
2669 	}
2670 
2671 	intel_iommu_init_qi(iommu);
2672 	iommu_flush_write_buffer(iommu);
2673 
2674 	if (ecap_prs(iommu->ecap)) {
2675 		ret = intel_iommu_enable_prq(iommu);
2676 		if (ret)
2677 			goto disable_iommu;
2678 	}
2679 
2680 	ret = dmar_set_interrupt(iommu);
2681 	if (ret)
2682 		goto disable_iommu;
2683 
2684 	iommu_set_root_entry(iommu);
2685 	iommu_enable_translation(iommu);
2686 
2687 	iommu_disable_protect_mem_regions(iommu);
2688 	return 0;
2689 
2690 disable_iommu:
2691 	disable_dmar_iommu(iommu);
2692 out:
2693 	free_dmar_iommu(iommu);
2694 	return ret;
2695 }
2696 
dmar_iommu_hotplug(struct dmar_drhd_unit * dmaru,bool insert)2697 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2698 {
2699 	int ret = 0;
2700 	struct intel_iommu *iommu = dmaru->iommu;
2701 
2702 	if (!intel_iommu_enabled)
2703 		return 0;
2704 	if (iommu == NULL)
2705 		return -EINVAL;
2706 
2707 	if (insert) {
2708 		ret = intel_iommu_add(dmaru);
2709 	} else {
2710 		disable_dmar_iommu(iommu);
2711 		free_dmar_iommu(iommu);
2712 	}
2713 
2714 	return ret;
2715 }
2716 
intel_iommu_free_dmars(void)2717 static void intel_iommu_free_dmars(void)
2718 {
2719 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2720 	struct dmar_atsr_unit *atsru, *atsr_n;
2721 	struct dmar_satc_unit *satcu, *satc_n;
2722 
2723 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2724 		list_del(&rmrru->list);
2725 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2726 		kfree(rmrru);
2727 	}
2728 
2729 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2730 		list_del(&atsru->list);
2731 		intel_iommu_free_atsr(atsru);
2732 	}
2733 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2734 		list_del(&satcu->list);
2735 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2736 		kfree(satcu);
2737 	}
2738 }
2739 
dmar_find_matched_satc_unit(struct pci_dev * dev)2740 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2741 {
2742 	struct dmar_satc_unit *satcu;
2743 	struct acpi_dmar_satc *satc;
2744 	struct device *tmp;
2745 	int i;
2746 
2747 	dev = pci_physfn(dev);
2748 	rcu_read_lock();
2749 
2750 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2751 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2752 		if (satc->segment != pci_domain_nr(dev->bus))
2753 			continue;
2754 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2755 			if (to_pci_dev(tmp) == dev)
2756 				goto out;
2757 	}
2758 	satcu = NULL;
2759 out:
2760 	rcu_read_unlock();
2761 	return satcu;
2762 }
2763 
dmar_ats_supported(struct pci_dev * dev,struct intel_iommu * iommu)2764 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2765 {
2766 	int i, ret = 1;
2767 	struct pci_bus *bus;
2768 	struct pci_dev *bridge = NULL;
2769 	struct device *tmp;
2770 	struct acpi_dmar_atsr *atsr;
2771 	struct dmar_atsr_unit *atsru;
2772 	struct dmar_satc_unit *satcu;
2773 
2774 	dev = pci_physfn(dev);
2775 	satcu = dmar_find_matched_satc_unit(dev);
2776 	if (satcu)
2777 		/*
2778 		 * This device supports ATS because it is listed in the SATC table.
2779 		 * When the IOMMU is in legacy mode, the hardware enables ATS
2780 		 * automatically for devices that require it, so the OS should
2781 		 * not enable ATS on the device as well, to avoid duplicated
2782 		 * TLB invalidations.
2783 		 */
2784 		return !(satcu->atc_required && !sm_supported(iommu));
2785 
2786 	for (bus = dev->bus; bus; bus = bus->parent) {
2787 		bridge = bus->self;
2788 		/* If it's an integrated device, allow ATS */
2789 		if (!bridge)
2790 			return 1;
2791 		/* Connected via non-PCIe: no ATS */
2792 		if (!pci_is_pcie(bridge) ||
2793 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2794 			return 0;
2795 		/* If we found the root port, look it up in the ATSR */
2796 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2797 			break;
2798 	}
2799 
2800 	rcu_read_lock();
2801 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2802 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2803 		if (atsr->segment != pci_domain_nr(dev->bus))
2804 			continue;
2805 
2806 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2807 			if (tmp == &bridge->dev)
2808 				goto out;
2809 
2810 		if (atsru->include_all)
2811 			goto out;
2812 	}
2813 	ret = 0;
2814 out:
2815 	rcu_read_unlock();
2816 
2817 	return ret;
2818 }
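
/*
 * Return-value sketch, summarising the walk above rather than adding
 * behaviour:
 *
 *	dmar_ats_supported(pdev, iommu) == 1	// OS may enable ATS
 *	dmar_ats_supported(pdev, iommu) == 0	// ATS must stay disabled
 *
 * A device listed in a SATC unit with atc_required set only yields 1
 * when the IOMMU is in scalable mode; in legacy mode the hardware
 * handles ATS for it, as the comment above explains.
 */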
2819 
dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info * info)2820 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2821 {
2822 	int ret;
2823 	struct dmar_rmrr_unit *rmrru;
2824 	struct dmar_atsr_unit *atsru;
2825 	struct dmar_satc_unit *satcu;
2826 	struct acpi_dmar_atsr *atsr;
2827 	struct acpi_dmar_reserved_memory *rmrr;
2828 	struct acpi_dmar_satc *satc;
2829 
2830 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2831 		return 0;
2832 
2833 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2834 		rmrr = container_of(rmrru->hdr,
2835 				    struct acpi_dmar_reserved_memory, header);
2836 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2837 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2838 				((void *)rmrr) + rmrr->header.length,
2839 				rmrr->segment, rmrru->devices,
2840 				rmrru->devices_cnt);
2841 			if (ret < 0)
2842 				return ret;
2843 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2844 			dmar_remove_dev_scope(info, rmrr->segment,
2845 				rmrru->devices, rmrru->devices_cnt);
2846 		}
2847 	}
2848 
2849 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2850 		if (atsru->include_all)
2851 			continue;
2852 
2853 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2854 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2855 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2856 					(void *)atsr + atsr->header.length,
2857 					atsr->segment, atsru->devices,
2858 					atsru->devices_cnt);
2859 			if (ret > 0)
2860 				break;
2861 			else if (ret < 0)
2862 				return ret;
2863 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2864 			if (dmar_remove_dev_scope(info, atsr->segment,
2865 					atsru->devices, atsru->devices_cnt))
2866 				break;
2867 		}
2868 	}
2869 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2870 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2871 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2872 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2873 					(void *)satc + satc->header.length,
2874 					satc->segment, satcu->devices,
2875 					satcu->devices_cnt);
2876 			if (ret > 0)
2877 				break;
2878 			else if (ret < 0)
2879 				return ret;
2880 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2881 			if (dmar_remove_dev_scope(info, satc->segment,
2882 					satcu->devices, satcu->devices_cnt))
2883 				break;
2884 		}
2885 	}
2886 
2887 	return 0;
2888 }
2889 
intel_disable_iommus(void)2890 static void intel_disable_iommus(void)
2891 {
2892 	struct intel_iommu *iommu = NULL;
2893 	struct dmar_drhd_unit *drhd;
2894 
2895 	for_each_iommu(iommu, drhd)
2896 		iommu_disable_translation(iommu);
2897 }
2898 
intel_iommu_shutdown(void)2899 void intel_iommu_shutdown(void)
2900 {
2901 	struct dmar_drhd_unit *drhd;
2902 	struct intel_iommu *iommu = NULL;
2903 
2904 	if (no_iommu || dmar_disabled)
2905 		return;
2906 
2907 	/*
2908 	 * All other CPUs were brought down, hotplug interrupts were disabled,
2909 	 * so no locking or RCU checking is needed anymore.
2910 	 */
2911 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2912 		iommu = drhd->iommu;
2913 
2914 		/* Disable PMRs explicitly here. */
2915 		iommu_disable_protect_mem_regions(iommu);
2916 
2917 		/* Make sure the IOMMUs are switched off */
2918 		iommu_disable_translation(iommu);
2919 	}
2920 }
2921 
dev_to_intel_iommu(struct device * dev)2922 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2923 {
2924 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2925 
2926 	return container_of(iommu_dev, struct intel_iommu, iommu);
2927 }
2928 
version_show(struct device * dev,struct device_attribute * attr,char * buf)2929 static ssize_t version_show(struct device *dev,
2930 			    struct device_attribute *attr, char *buf)
2931 {
2932 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2933 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2934 	return sysfs_emit(buf, "%d:%d\n",
2935 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2936 }
2937 static DEVICE_ATTR_RO(version);
2938 
address_show(struct device * dev,struct device_attribute * attr,char * buf)2939 static ssize_t address_show(struct device *dev,
2940 			    struct device_attribute *attr, char *buf)
2941 {
2942 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2943 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2944 }
2945 static DEVICE_ATTR_RO(address);
2946 
cap_show(struct device * dev,struct device_attribute * attr,char * buf)2947 static ssize_t cap_show(struct device *dev,
2948 			struct device_attribute *attr, char *buf)
2949 {
2950 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2951 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2952 }
2953 static DEVICE_ATTR_RO(cap);
2954 
ecap_show(struct device * dev,struct device_attribute * attr,char * buf)2955 static ssize_t ecap_show(struct device *dev,
2956 			 struct device_attribute *attr, char *buf)
2957 {
2958 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2959 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2960 }
2961 static DEVICE_ATTR_RO(ecap);
2962 
domains_supported_show(struct device * dev,struct device_attribute * attr,char * buf)2963 static ssize_t domains_supported_show(struct device *dev,
2964 				      struct device_attribute *attr, char *buf)
2965 {
2966 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2967 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2968 }
2969 static DEVICE_ATTR_RO(domains_supported);
2970 
domains_used_show(struct device * dev,struct device_attribute * attr,char * buf)2971 static ssize_t domains_used_show(struct device *dev,
2972 				 struct device_attribute *attr, char *buf)
2973 {
2974 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2975 	return sysfs_emit(buf, "%d\n",
2976 			  bitmap_weight(iommu->domain_ids,
2977 					cap_ndoms(iommu->cap)));
2978 }
2979 static DEVICE_ATTR_RO(domains_used);
2980 
2981 static struct attribute *intel_iommu_attrs[] = {
2982 	&dev_attr_version.attr,
2983 	&dev_attr_address.attr,
2984 	&dev_attr_cap.attr,
2985 	&dev_attr_ecap.attr,
2986 	&dev_attr_domains_supported.attr,
2987 	&dev_attr_domains_used.attr,
2988 	NULL,
2989 };
2990 
2991 static struct attribute_group intel_iommu_group = {
2992 	.name = "intel-iommu",
2993 	.attrs = intel_iommu_attrs,
2994 };
2995 
2996 const struct attribute_group *intel_iommu_groups[] = {
2997 	&intel_iommu_group,
2998 	NULL,
2999 };
3000 
has_external_pci(void)3001 static bool has_external_pci(void)
3002 {
3003 	struct pci_dev *pdev = NULL;
3004 
3005 	for_each_pci_dev(pdev)
3006 		if (pdev->external_facing) {
3007 			pci_dev_put(pdev);
3008 			return true;
3009 		}
3010 
3011 	return false;
3012 }
3013 
platform_optin_force_iommu(void)3014 static int __init platform_optin_force_iommu(void)
3015 {
3016 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3017 		return 0;
3018 
3019 	if (no_iommu || dmar_disabled)
3020 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3021 
3022 	/*
3023 	 * If Intel-IOMMU is disabled by default, we will apply identity
3024 	 * map for all devices except those marked as being untrusted.
3025 	 */
3026 	if (dmar_disabled)
3027 		iommu_set_default_passthrough(false);
3028 
3029 	dmar_disabled = 0;
3030 	no_iommu = 0;
3031 
3032 	return 1;
3033 }
3034 
probe_acpi_namespace_devices(void)3035 static int __init probe_acpi_namespace_devices(void)
3036 {
3037 	struct dmar_drhd_unit *drhd;
3038 	/* To avoid a -Wunused-but-set-variable warning. */
3039 	struct intel_iommu *iommu __maybe_unused;
3040 	struct device *dev;
3041 	int i, ret = 0;
3042 
3043 	for_each_active_iommu(iommu, drhd) {
3044 		for_each_active_dev_scope(drhd->devices,
3045 					  drhd->devices_cnt, i, dev) {
3046 			struct acpi_device_physical_node *pn;
3047 			struct acpi_device *adev;
3048 
3049 			if (dev->bus != &acpi_bus_type)
3050 				continue;
3051 
3052 			up_read(&dmar_global_lock);
3053 			adev = to_acpi_device(dev);
3054 			mutex_lock(&adev->physical_node_lock);
3055 			list_for_each_entry(pn,
3056 					    &adev->physical_node_list, node) {
3057 				ret = iommu_probe_device(pn->dev);
3058 				if (ret)
3059 					break;
3060 			}
3061 			mutex_unlock(&adev->physical_node_lock);
3062 			down_read(&dmar_global_lock);
3063 
3064 			if (ret)
3065 				return ret;
3066 		}
3067 	}
3068 
3069 	return 0;
3070 }
3071 
tboot_force_iommu(void)3072 static __init int tboot_force_iommu(void)
3073 {
3074 	if (!tboot_enabled())
3075 		return 0;
3076 
3077 	if (no_iommu || dmar_disabled)
3078 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3079 
3080 	dmar_disabled = 0;
3081 	no_iommu = 0;
3082 
3083 	return 1;
3084 }
3085 
intel_iommu_init(void)3086 int __init intel_iommu_init(void)
3087 {
3088 	int ret = -ENODEV;
3089 	struct dmar_drhd_unit *drhd;
3090 	struct intel_iommu *iommu;
3091 
3092 	/*
3093 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3094 	 * opt in, so enforce that.
3095 	 */
3096 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3097 		    platform_optin_force_iommu();
3098 
3099 	down_write(&dmar_global_lock);
3100 	if (dmar_table_init()) {
3101 		if (force_on)
3102 			panic("tboot: Failed to initialize DMAR table\n");
3103 		goto out_free_dmar;
3104 	}
3105 
3106 	if (dmar_dev_scope_init() < 0) {
3107 		if (force_on)
3108 			panic("tboot: Failed to initialize DMAR device scope\n");
3109 		goto out_free_dmar;
3110 	}
3111 
3112 	up_write(&dmar_global_lock);
3113 
3114 	/*
3115 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3116 	 * complain later when we register it under the lock.
3117 	 */
3118 	dmar_register_bus_notifier();
3119 
3120 	down_write(&dmar_global_lock);
3121 
3122 	if (!no_iommu)
3123 		intel_iommu_debugfs_init();
3124 
3125 	if (no_iommu || dmar_disabled) {
3126 		/*
3127 		 * We exit the function here to ensure the IOMMU's remapping and
3128 		 * mempool aren't set up, which means that the IOMMU's PMRs
3129 		 * won't be disabled via the call to init_dmars(). So disable
3130 		 * them explicitly here. The PMRs were set up by tboot prior to
3131 		 * calling SENTER, but the kernel is expected to reset/tear
3132 		 * down the PMRs.
3133 		 */
3134 		if (intel_iommu_tboot_noforce) {
3135 			for_each_iommu(iommu, drhd)
3136 				iommu_disable_protect_mem_regions(iommu);
3137 		}
3138 
3139 		/*
3140 		 * Make sure the IOMMUs are switched off, even when we
3141 		 * boot into a kexec kernel and the previous kernel left
3142 		 * them enabled
3143 		 */
3144 		intel_disable_iommus();
3145 		goto out_free_dmar;
3146 	}
3147 
3148 	if (list_empty(&dmar_rmrr_units))
3149 		pr_info("No RMRR found\n");
3150 
3151 	if (list_empty(&dmar_atsr_units))
3152 		pr_info("No ATSR found\n");
3153 
3154 	if (list_empty(&dmar_satc_units))
3155 		pr_info("No SATC found\n");
3156 
3157 	init_no_remapping_devices();
3158 
3159 	ret = init_dmars();
3160 	if (ret) {
3161 		if (force_on)
3162 			panic("tboot: Failed to initialize DMARs\n");
3163 		pr_err("Initialization failed\n");
3164 		goto out_free_dmar;
3165 	}
3166 	up_write(&dmar_global_lock);
3167 
3168 	init_iommu_pm_ops();
3169 
3170 	down_read(&dmar_global_lock);
3171 	for_each_active_iommu(iommu, drhd) {
3172 		/*
3173 		 * The flush queue implementation does not perform
3174 		 * page-selective invalidations that are required for efficient
3175 		 * TLB flushes in virtual environments.  The benefit of batching
3176 		 * is likely to be much lower than the overhead of synchronizing
3177 		 * the virtual and physical IOMMU page-tables.
3178 		 */
3179 		if (cap_caching_mode(iommu->cap) &&
3180 		    !first_level_by_default(iommu)) {
3181 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3182 			iommu_set_dma_strict();
3183 		}
3184 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3185 				       intel_iommu_groups,
3186 				       "%s", iommu->name);
3187 		/*
3188 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3189 		 * Release the dmar_global_lock before entering the device probe path
3190 		 * to avoid an unnecessary lock-order splat.
3191 		 */
3192 		up_read(&dmar_global_lock);
3193 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3194 		down_read(&dmar_global_lock);
3195 
3196 		iommu_pmu_register(iommu);
3197 	}
3198 
3199 	if (probe_acpi_namespace_devices())
3200 		pr_warn("ACPI name space devices didn't probe correctly\n");
3201 
3202 	/* Finally, we enable the DMA remapping hardware. */
3203 	for_each_iommu(iommu, drhd) {
3204 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3205 			iommu_enable_translation(iommu);
3206 
3207 		iommu_disable_protect_mem_regions(iommu);
3208 	}
3209 	up_read(&dmar_global_lock);
3210 
3211 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3212 
3213 	intel_iommu_enabled = 1;
3214 
3215 	return 0;
3216 
3217 out_free_dmar:
3218 	intel_iommu_free_dmars();
3219 	up_write(&dmar_global_lock);
3220 	return ret;
3221 }
3222 
domain_context_clear_one_cb(struct pci_dev * pdev,u16 alias,void * opaque)3223 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3224 {
3225 	struct device_domain_info *info = opaque;
3226 
3227 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3228 	return 0;
3229 }
3230 
3231 /*
3232  * NB - intel-iommu lacks any sort of reference counting for the users of
3233  * dependent devices.  If multiple endpoints have intersecting dependent
3234  * devices, unbinding the driver from any one of them will possibly leave
3235  * the others unable to operate.
3236  */
domain_context_clear(struct device_domain_info * info)3237 static void domain_context_clear(struct device_domain_info *info)
3238 {
3239 	if (!dev_is_pci(info->dev)) {
3240 		domain_context_clear_one(info, info->bus, info->devfn);
3241 		return;
3242 	}
3243 
3244 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3245 			       &domain_context_clear_one_cb, info);
3246 	iommu_disable_pci_ats(info);
3247 }
3248 
3249 /*
3250  * Clear the page table pointer in context or pasid table entries so that
3251  * all DMA requests without PASID from the device are blocked. If the page
3252  * table has been set, clean up the data structures.
3253  */
device_block_translation(struct device * dev)3254 void device_block_translation(struct device *dev)
3255 {
3256 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3257 	struct intel_iommu *iommu = info->iommu;
3258 	unsigned long flags;
3259 
3260 	if (info->domain)
3261 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3262 
3263 	if (!dev_is_real_dma_subdevice(dev)) {
3264 		if (sm_supported(iommu))
3265 			intel_pasid_tear_down_entry(iommu, dev,
3266 						    IOMMU_NO_PASID, false);
3267 		else
3268 			domain_context_clear(info);
3269 	}
3270 
3271 	if (!info->domain)
3272 		return;
3273 
3274 	spin_lock_irqsave(&info->domain->lock, flags);
3275 	list_del(&info->link);
3276 	spin_unlock_irqrestore(&info->domain->lock, flags);
3277 
3278 	domain_detach_iommu(info->domain, iommu);
3279 	info->domain = NULL;
3280 }
3281 
blocking_domain_attach_dev(struct iommu_domain * domain,struct device * dev)3282 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3283 				      struct device *dev)
3284 {
3285 	device_block_translation(dev);
3286 	return 0;
3287 }
3288 
3289 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3290 					 struct device *dev, ioasid_t pasid,
3291 					 struct iommu_domain *old);
3292 
3293 static struct iommu_domain blocking_domain = {
3294 	.type = IOMMU_DOMAIN_BLOCKED,
3295 	.ops = &(const struct iommu_domain_ops) {
3296 		.attach_dev	= blocking_domain_attach_dev,
3297 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3298 	}
3299 };
3300 
iommu_superpage_capability(struct intel_iommu * iommu,bool first_stage)3301 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3302 {
3303 	if (!intel_iommu_superpage)
3304 		return 0;
3305 
3306 	if (first_stage)
3307 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3308 
3309 	return fls(cap_super_page_val(iommu->cap));
3310 }
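
/*
 * Worked example (a sketch that relies on domain_super_pgsize_bitmap(),
 * defined elsewhere): a return value of 1 means 2MiB superpages only,
 * 2 means 2MiB and 1GiB, and 0 disables superpages, so
 * paging_domain_alloc() below ends up with roughly:
 *
 *	iommu_superpage == 0:  pgsize_bitmap == SZ_4K
 *	iommu_superpage == 1:  pgsize_bitmap == SZ_4K | SZ_2M
 *	iommu_superpage >= 2:  pgsize_bitmap == SZ_4K | SZ_2M | SZ_1G
 */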
3311 
paging_domain_alloc(struct device * dev,bool first_stage)3312 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3313 {
3314 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3315 	struct intel_iommu *iommu = info->iommu;
3316 	struct dmar_domain *domain;
3317 	int addr_width;
3318 
3319 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3320 	if (!domain)
3321 		return ERR_PTR(-ENOMEM);
3322 
3323 	INIT_LIST_HEAD(&domain->devices);
3324 	INIT_LIST_HEAD(&domain->dev_pasids);
3325 	INIT_LIST_HEAD(&domain->cache_tags);
3326 	spin_lock_init(&domain->lock);
3327 	spin_lock_init(&domain->cache_lock);
3328 	xa_init(&domain->iommu_array);
3329 
3330 	domain->nid = dev_to_node(dev);
3331 	domain->use_first_level = first_stage;
3332 
3333 	/* calculate the address width */
3334 	addr_width = agaw_to_width(iommu->agaw);
3335 	if (addr_width > cap_mgaw(iommu->cap))
3336 		addr_width = cap_mgaw(iommu->cap);
3337 	domain->gaw = addr_width;
3338 	domain->agaw = iommu->agaw;
3339 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3340 
3341 	/* iommu memory access coherency */
3342 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3343 
3344 	/* pagesize bitmap */
3345 	domain->domain.pgsize_bitmap = SZ_4K;
3346 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3347 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3348 
3349 	/*
3350 	 * IOVA aperture: First-level translation restricts the input-address
3351 	 * to a canonical address (i.e., address bits 63:N have the same value
3352 	 * as address bit [N-1], where N is 48 with 4-level paging and
3353 	 * 57 with 5-level paging). Hence, skip bit [N-1].
3354 	 */
3355 	domain->domain.geometry.force_aperture = true;
3356 	domain->domain.geometry.aperture_start = 0;
3357 	if (first_stage)
3358 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3359 	else
3360 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3361 
3362 	/* always allocate the top pgd */
3363 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3364 	if (!domain->pgd) {
3365 		kfree(domain);
3366 		return ERR_PTR(-ENOMEM);
3367 	}
3368 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3369 
3370 	return domain;
3371 }
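
/*
 * Aperture example with illustrative numbers, assuming gaw == 48: a
 * second-stage domain gets aperture_end = __DOMAIN_MAX_ADDR(48) =
 * 0xffffffffffff, while a first-stage domain is clipped to
 * __DOMAIN_MAX_ADDR(47) = 0x7fffffffffff so that bit 47 can act as the
 * canonical sign-extension bit described in the comment above.
 */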
3372 
3373 static struct iommu_domain *
intel_iommu_domain_alloc_paging_flags(struct device * dev,u32 flags,const struct iommu_user_data * user_data)3374 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3375 				      const struct iommu_user_data *user_data)
3376 {
3377 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3378 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3379 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3380 	struct intel_iommu *iommu = info->iommu;
3381 	struct dmar_domain *dmar_domain;
3382 	struct iommu_domain *domain;
3383 	bool first_stage;
3384 
3385 	if (flags &
3386 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3387 		return ERR_PTR(-EOPNOTSUPP);
3388 	if (nested_parent && !nested_supported(iommu))
3389 		return ERR_PTR(-EOPNOTSUPP);
3390 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3391 		return ERR_PTR(-EOPNOTSUPP);
3392 
3393 	/*
3394 	 * Always allocate the guest compatible page table unless
3395 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3396 	 * is specified.
3397 	 */
3398 	if (nested_parent || dirty_tracking) {
3399 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3400 			return ERR_PTR(-EOPNOTSUPP);
3401 		first_stage = false;
3402 	} else {
3403 		first_stage = first_level_by_default(iommu);
3404 	}
3405 
3406 	dmar_domain = paging_domain_alloc(dev, first_stage);
3407 	if (IS_ERR(dmar_domain))
3408 		return ERR_CAST(dmar_domain);
3409 	domain = &dmar_domain->domain;
3410 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3411 	domain->owner = &intel_iommu_ops;
3412 	domain->ops = intel_iommu_ops.default_domain_ops;
3413 
3414 	if (nested_parent) {
3415 		dmar_domain->nested_parent = true;
3416 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3417 		spin_lock_init(&dmar_domain->s1_lock);
3418 	}
3419 
3420 	if (dirty_tracking) {
3421 		if (dmar_domain->use_first_level) {
3422 			iommu_domain_free(domain);
3423 			return ERR_PTR(-EOPNOTSUPP);
3424 		}
3425 		domain->dirty_ops = &intel_dirty_ops;
3426 	}
3427 
3428 	return domain;
3429 }
3430 
intel_iommu_domain_free(struct iommu_domain * domain)3431 static void intel_iommu_domain_free(struct iommu_domain *domain)
3432 {
3433 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3434 
3435 	WARN_ON(dmar_domain->nested_parent &&
3436 		!list_empty(&dmar_domain->s1_domains));
3437 	domain_exit(dmar_domain);
3438 }
3439 
paging_domain_compatible(struct iommu_domain * domain,struct device * dev)3440 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3441 {
3442 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3443 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3444 	struct intel_iommu *iommu = info->iommu;
3445 	int addr_width;
3446 
3447 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3448 		return -EPERM;
3449 
3450 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3451 		return -EINVAL;
3452 
3453 	if (domain->dirty_ops && !ssads_supported(iommu))
3454 		return -EINVAL;
3455 
3456 	if (dmar_domain->iommu_coherency !=
3457 			iommu_paging_structure_coherency(iommu))
3458 		return -EINVAL;
3459 
3460 	if (dmar_domain->iommu_superpage !=
3461 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3462 		return -EINVAL;
3463 
3464 	if (dmar_domain->use_first_level &&
3465 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3466 		return -EINVAL;
3467 
3468 	/* check if this iommu agaw is sufficient for max mapped address */
3469 	addr_width = agaw_to_width(iommu->agaw);
3470 	if (addr_width > cap_mgaw(iommu->cap))
3471 		addr_width = cap_mgaw(iommu->cap);
3472 
3473 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3474 		return -EINVAL;
3475 
3476 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3477 	    context_copied(iommu, info->bus, info->devfn))
3478 		return intel_pasid_setup_sm_context(dev);
3479 
3480 	return 0;
3481 }
3482 
3483 static int intel_iommu_attach_device(struct iommu_domain *domain,
3484 				     struct device *dev)
3485 {
3486 	int ret;
3487 
3488 	device_block_translation(dev);
3489 
3490 	ret = paging_domain_compatible(domain, dev);
3491 	if (ret)
3492 		return ret;
3493 
3494 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3495 }
3496 
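/*
 * Map helper shared by intel_iommu_map_pages(): translate the generic
 * IOMMU_READ/IOMMU_WRITE flags into DMA PTE bits, reject requests that
 * exceed the domain's guest address width, and hand the range to
 * __domain_mapping() in units of VTD_PAGE_SIZE frames.
 */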
3497 static int intel_iommu_map(struct iommu_domain *domain,
3498 			   unsigned long iova, phys_addr_t hpa,
3499 			   size_t size, int iommu_prot, gfp_t gfp)
3500 {
3501 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3502 	u64 max_addr;
3503 	int prot = 0;
3504 
3505 	if (iommu_prot & IOMMU_READ)
3506 		prot |= DMA_PTE_READ;
3507 	if (iommu_prot & IOMMU_WRITE)
3508 		prot |= DMA_PTE_WRITE;
3509 	if (dmar_domain->set_pte_snp)
3510 		prot |= DMA_PTE_SNP;
3511 
3512 	max_addr = iova + size;
3513 	if (dmar_domain->max_addr < max_addr) {
3514 		u64 end;
3515 
3516 		/* check if minimum agaw is sufficient for mapped address */
3517 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3518 		if (end < max_addr) {
3519 			pr_err("%s: iommu width (%d) is not "
3520 			       "sufficient for the mapped address (%llx)\n",
3521 			       __func__, dmar_domain->gaw, max_addr);
3522 			return -EFAULT;
3523 		}
3524 		dmar_domain->max_addr = max_addr;
3525 	}
3526 	/* Round up size to next multiple of PAGE_SIZE, if it and
3527 	   the low bits of hpa would take us onto the next page */
3528 	size = aligned_nrpages(hpa, size);
3529 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3530 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3531 }
3532 
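/*
 * iommu_domain_ops::map_pages callback: accept a run of equally sized
 * pages (4KiB, 2MiB or 1GiB), map it as one contiguous request and
 * report the number of bytes mapped back through @mapped on success.
 */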
3533 static int intel_iommu_map_pages(struct iommu_domain *domain,
3534 				 unsigned long iova, phys_addr_t paddr,
3535 				 size_t pgsize, size_t pgcount,
3536 				 int prot, gfp_t gfp, size_t *mapped)
3537 {
3538 	unsigned long pgshift = __ffs(pgsize);
3539 	size_t size = pgcount << pgshift;
3540 	int ret;
3541 
3542 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3543 		return -EINVAL;
3544 
3545 	if (!IS_ALIGNED(iova | paddr, pgsize))
3546 		return -EINVAL;
3547 
3548 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3549 	if (!ret && mapped)
3550 		*mapped = size;
3551 
3552 	return ret;
3553 }
3554 
3555 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3556 				unsigned long iova, size_t size,
3557 				struct iommu_iotlb_gather *gather)
3558 {
3559 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3560 	unsigned long start_pfn, last_pfn;
3561 	int level = 0;
3562 
3563 	/* Cope with horrid API which requires us to unmap more than the
3564 	   size argument if it happens to be a large-page mapping. */
3565 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3566 				     &level, GFP_ATOMIC)))
3567 		return 0;
3568 
3569 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3570 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3571 
3572 	start_pfn = iova >> VTD_PAGE_SHIFT;
3573 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3574 
3575 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3576 
3577 	if (dmar_domain->max_addr == iova + size)
3578 		dmar_domain->max_addr = iova;
3579 
3580 	/*
3581 	 * We do not use page-selective IOTLB invalidation in the flush
3582 	 * queue, so there is no need to track pages and sync the IOTLB.
3583 	 */
3584 	if (!iommu_iotlb_gather_queued(gather))
3585 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3586 
3587 	return size;
3588 }
3589 
3590 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3591 				      unsigned long iova,
3592 				      size_t pgsize, size_t pgcount,
3593 				      struct iommu_iotlb_gather *gather)
3594 {
3595 	unsigned long pgshift = __ffs(pgsize);
3596 	size_t size = pgcount << pgshift;
3597 
3598 	return intel_iommu_unmap(domain, iova, size, gather);
3599 }
3600 
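/* Flush the gathered IOVA range and free page-table pages queued by unmap. */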
3601 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3602 				 struct iommu_iotlb_gather *gather)
3603 {
3604 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3605 			      gather->end, list_empty(&gather->freelist));
3606 	iommu_put_pages_list(&gather->freelist);
3607 }
3608 
3609 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3610 					    dma_addr_t iova)
3611 {
3612 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3613 	struct dma_pte *pte;
3614 	int level = 0;
3615 	u64 phys = 0;
3616 
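	/*
	 * Walk down to the leaf entry covering this IOVA. For superpage
	 * mappings the leaf sits above level 1, so keep the offset bits
	 * below the leaf (level_to_offset_bits(level) + VTD_PAGE_SHIFT)
	 * when composing the physical address.
	 */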
3617 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3618 			     GFP_ATOMIC);
3619 	if (pte && dma_pte_present(pte))
3620 		phys = dma_pte_addr(pte) +
3621 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3622 						VTD_PAGE_SHIFT) - 1));
3623 
3624 	return phys;
3625 }
3626 
3627 static bool domain_support_force_snooping(struct dmar_domain *domain)
3628 {
3629 	struct device_domain_info *info;
3630 	bool support = true;
3631 
3632 	assert_spin_locked(&domain->lock);
3633 	list_for_each_entry(info, &domain->devices, link) {
3634 		if (!ecap_sc_support(info->iommu->ecap)) {
3635 			support = false;
3636 			break;
3637 		}
3638 	}
3639 
3640 	return support;
3641 }
3642 
3643 static void domain_set_force_snooping(struct dmar_domain *domain)
3644 {
3645 	struct device_domain_info *info;
3646 
3647 	assert_spin_locked(&domain->lock);
3648 	/*
3649 	 * Second level page table supports per-PTE snoop control. The
3650 	 * iommu_map() interface will handle this by setting SNP bit.
3651 	 */
3652 	if (!domain->use_first_level) {
3653 		domain->set_pte_snp = true;
3654 		return;
3655 	}
3656 
3657 	list_for_each_entry(info, &domain->devices, link)
3658 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3659 						     IOMMU_NO_PASID);
3660 }
3661 
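/*
 * Make the domain snoop all DMA if every attached IOMMU can enforce it.
 * Second-stage tables get the SNP bit set on PTEs created from now on,
 * which is why a domain that already has mappings is refused; first-stage
 * tables program snoop control into each device's PASID entry instead.
 */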
3662 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3663 {
3664 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3665 	unsigned long flags;
3666 
3667 	if (dmar_domain->force_snooping)
3668 		return true;
3669 
3670 	spin_lock_irqsave(&dmar_domain->lock, flags);
3671 	if (!domain_support_force_snooping(dmar_domain) ||
3672 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3673 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3674 		return false;
3675 	}
3676 
3677 	domain_set_force_snooping(dmar_domain);
3678 	dmar_domain->force_snooping = true;
3679 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3680 
3681 	return true;
3682 }
3683 
3684 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3685 {
3686 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3687 
3688 	switch (cap) {
3689 	case IOMMU_CAP_CACHE_COHERENCY:
3690 	case IOMMU_CAP_DEFERRED_FLUSH:
3691 		return true;
3692 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3693 		return dmar_platform_optin();
3694 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3695 		return ecap_sc_support(info->iommu->ecap);
3696 	case IOMMU_CAP_DIRTY_TRACKING:
3697 		return ssads_supported(info->iommu);
3698 	default:
3699 		return false;
3700 	}
3701 }
3702 
3703 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3704 {
3705 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3706 	struct device_domain_info *info;
3707 	struct intel_iommu *iommu;
3708 	u8 bus, devfn;
3709 	int ret;
3710 
3711 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3712 	if (!iommu || !iommu->iommu.ops)
3713 		return ERR_PTR(-ENODEV);
3714 
3715 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3716 	if (!info)
3717 		return ERR_PTR(-ENOMEM);
3718 
3719 	if (dev_is_real_dma_subdevice(dev)) {
3720 		info->bus = pdev->bus->number;
3721 		info->devfn = pdev->devfn;
3722 		info->segment = pci_domain_nr(pdev->bus);
3723 	} else {
3724 		info->bus = bus;
3725 		info->devfn = devfn;
3726 		info->segment = iommu->segment;
3727 	}
3728 
3729 	info->dev = dev;
3730 	info->iommu = iommu;
3731 	if (dev_is_pci(dev)) {
3732 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3733 		    pci_ats_supported(pdev) &&
3734 		    dmar_ats_supported(pdev, iommu)) {
3735 			info->ats_supported = 1;
3736 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3737 
3738 			/*
3739 			 * For an IOMMU that supports device IOTLB throttling
3740 			 * (DIT), assign the PFSID to a VF's invalidation
3741 			 * descriptors so that the IOMMU hardware can gauge
3742 			 * queue depth at the PF level. If DIT is not set, the
3743 			 * PFSID field is treated as reserved and must be 0.
3744 			 */
3745 			if (ecap_dit(iommu->ecap))
3746 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3747 			info->ats_qdep = pci_ats_queue_depth(pdev);
3748 		}
3749 		if (sm_supported(iommu)) {
3750 			if (pasid_supported(iommu)) {
3751 				int features = pci_pasid_features(pdev);
3752 
3753 				if (features >= 0)
3754 					info->pasid_supported = features | 1;
3755 			}
3756 
3757 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3758 			    pci_pri_supported(pdev))
3759 				info->pri_supported = 1;
3760 		}
3761 	}
3762 
3763 	dev_iommu_priv_set(dev, info);
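	/*
	 * ATS-capable devices are tracked in the per-IOMMU rbtree keyed by
	 * PCI requester ID, and ATS is prepared with the VT-d page
	 * alignment up front.
	 */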
3764 	if (pdev && pci_ats_supported(pdev)) {
3765 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3766 		ret = device_rbtree_insert(iommu, info);
3767 		if (ret)
3768 			goto free;
3769 	}
3770 
3771 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3772 		ret = intel_pasid_alloc_table(dev);
3773 		if (ret) {
3774 			dev_err(dev, "PASID table allocation failed\n");
3775 			goto clear_rbtree;
3776 		}
3777 
3778 		if (!context_copied(iommu, info->bus, info->devfn)) {
3779 			ret = intel_pasid_setup_sm_context(dev);
3780 			if (ret)
3781 				goto free_table;
3782 		}
3783 	}
3784 
3785 	intel_iommu_debugfs_create_dev(info);
3786 
3787 	/*
3788 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3789 	 * device is undefined if you enable PASID support after ATS support.
3790 	 * So always enable PASID support on devices which have it, even if
3791 	 * we can't yet know if we're ever going to use it.
3792 	 */
3793 	if (info->pasid_supported &&
3794 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3795 		info->pasid_enabled = 1;
3796 
3797 	if (sm_supported(iommu))
3798 		iommu_enable_pci_ats(info);
3799 	iommu_enable_pci_pri(info);
3800 
3801 	return &iommu->iommu;
3802 free_table:
3803 	intel_pasid_free_table(dev);
3804 clear_rbtree:
3805 	device_rbtree_remove(info);
3806 free:
3807 	kfree(info);
3808 
3809 	return ERR_PTR(ret);
3810 }
3811 
3812 static void intel_iommu_release_device(struct device *dev)
3813 {
3814 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3815 	struct intel_iommu *iommu = info->iommu;
3816 
3817 	iommu_disable_pci_pri(info);
3818 	iommu_disable_pci_ats(info);
3819 
3820 	if (info->pasid_enabled) {
3821 		pci_disable_pasid(to_pci_dev(dev));
3822 		info->pasid_enabled = 0;
3823 	}
3824 
3825 	mutex_lock(&iommu->iopf_lock);
3826 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3827 		device_rbtree_remove(info);
3828 	mutex_unlock(&iommu->iopf_lock);
3829 
3830 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3831 	    !context_copied(iommu, info->bus, info->devfn))
3832 		intel_pasid_teardown_sm_context(dev);
3833 
3834 	intel_pasid_free_table(dev);
3835 	intel_iommu_debugfs_remove_dev(info);
3836 	kfree(info);
3837 	set_dma_ops(dev, NULL);
3838 }
3839 
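/*
 * Report reserved IOVA ranges for @device: every RMRR that names @device,
 * or a bridge that @device sits behind, becomes an IOMMU_RESV_DIRECT*
 * region, ISA bridges optionally get the legacy sub-16MiB floppy
 * workaround window, and the IOAPIC range is always reported as an
 * IOMMU_RESV_MSI region.
 */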
3840 static void intel_iommu_get_resv_regions(struct device *device,
3841 					 struct list_head *head)
3842 {
3843 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3844 	struct iommu_resv_region *reg;
3845 	struct dmar_rmrr_unit *rmrr;
3846 	struct device *i_dev;
3847 	int i;
3848 
3849 	rcu_read_lock();
3850 	for_each_rmrr_units(rmrr) {
3851 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3852 					  i, i_dev) {
3853 			struct iommu_resv_region *resv;
3854 			enum iommu_resv_type type;
3855 			size_t length;
3856 
3857 			if (i_dev != device &&
3858 			    !is_downstream_to_pci_bridge(device, i_dev))
3859 				continue;
3860 
3861 			length = rmrr->end_address - rmrr->base_address + 1;
3862 
3863 			type = device_rmrr_is_relaxable(device) ?
3864 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3865 
3866 			resv = iommu_alloc_resv_region(rmrr->base_address,
3867 						       length, prot, type,
3868 						       GFP_ATOMIC);
3869 			if (!resv)
3870 				break;
3871 
3872 			list_add_tail(&resv->list, head);
3873 		}
3874 	}
3875 	rcu_read_unlock();
3876 
3877 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3878 	if (dev_is_pci(device)) {
3879 		struct pci_dev *pdev = to_pci_dev(device);
3880 
3881 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3882 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3883 					IOMMU_RESV_DIRECT_RELAXABLE,
3884 					GFP_KERNEL);
3885 			if (reg)
3886 				list_add_tail(&reg->list, head);
3887 		}
3888 	}
3889 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3890 
3891 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3892 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3893 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3894 	if (!reg)
3895 		return;
3896 	list_add_tail(&reg->list, head);
3897 }
3898 
3899 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3900 {
3901 	if (dev_is_pci(dev))
3902 		return pci_device_group(dev);
3903 	return generic_device_group(dev);
3904 }
3905 
3906 int intel_iommu_enable_iopf(struct device *dev)
3907 {
3908 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3909 	struct intel_iommu *iommu = info->iommu;
3910 	int ret;
3911 
3912 	if (!info->pri_enabled)
3913 		return -ENODEV;
3914 
3915 	if (info->iopf_refcount) {
3916 		info->iopf_refcount++;
3917 		return 0;
3918 	}
3919 
3920 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3921 	if (ret)
3922 		return ret;
3923 
3924 	info->iopf_refcount = 1;
3925 
3926 	return 0;
3927 }
3928 
3929 void intel_iommu_disable_iopf(struct device *dev)
3930 {
3931 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3932 	struct intel_iommu *iommu = info->iommu;
3933 
3934 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3935 		return;
3936 
3937 	if (--info->iopf_refcount)
3938 		return;
3939 
3940 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3941 }
3942 
3943 static int
3944 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
3945 {
3946 	switch (feat) {
3947 	case IOMMU_DEV_FEAT_IOPF:
3948 		return intel_iommu_enable_iopf(dev);
3949 
3950 	case IOMMU_DEV_FEAT_SVA:
3951 		return 0;
3952 
3953 	default:
3954 		return -ENODEV;
3955 	}
3956 }
3957 
3958 static int
3959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
3960 {
3961 	switch (feat) {
3962 	case IOMMU_DEV_FEAT_IOPF:
3963 		intel_iommu_disable_iopf(dev);
3964 		return 0;
3965 
3966 	case IOMMU_DEV_FEAT_SVA:
3967 		return 0;
3968 
3969 	default:
3970 		return -ENODEV;
3971 	}
3972 }
3973 
3974 static bool intel_iommu_is_attach_deferred(struct device *dev)
3975 {
3976 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3977 
3978 	return translation_pre_enabled(info->iommu) && !info->domain;
3979 }
3980 
3981 /*
3982  * Check that the device does not live on an external facing PCI port that is
3983  * marked as untrusted. Such devices should not be able to apply quirks and
3984  * thus not be able to bypass the IOMMU restrictions.
3985  */
3986 static bool risky_device(struct pci_dev *pdev)
3987 {
3988 	if (pdev->untrusted) {
3989 		pci_info(pdev,
3990 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3991 			 pdev->vendor, pdev->device);
3992 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3993 		return true;
3994 	}
3995 	return false;
3996 }
3997 
3998 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
3999 				      unsigned long iova, size_t size)
4000 {
4001 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4002 
4003 	return 0;
4004 }
4005 
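/*
 * Detach @pasid of @dev from @domain's bookkeeping: drop the dev_pasid
 * tracking entry, the cache tag and the domain's reference on the iommu.
 */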
4006 void domain_remove_dev_pasid(struct iommu_domain *domain,
4007 			     struct device *dev, ioasid_t pasid)
4008 {
4009 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4010 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4011 	struct intel_iommu *iommu = info->iommu;
4012 	struct dmar_domain *dmar_domain;
4013 	unsigned long flags;
4014 
4015 	if (!domain)
4016 		return;
4017 
4018 	/* The identity domain keeps no per-PASID metadata. */
4019 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4020 		return;
4021 
4022 	dmar_domain = to_dmar_domain(domain);
4023 	spin_lock_irqsave(&dmar_domain->lock, flags);
4024 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4025 		if (curr->dev == dev && curr->pasid == pasid) {
4026 			list_del(&curr->link_domain);
4027 			dev_pasid = curr;
4028 			break;
4029 		}
4030 	}
4031 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4032 
4033 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4034 	domain_detach_iommu(dmar_domain, iommu);
4035 	if (!WARN_ON_ONCE(!dev_pasid)) {
4036 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4037 		kfree(dev_pasid);
4038 	}
4039 }
4040 
4041 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4042 					 struct device *dev, ioasid_t pasid,
4043 					 struct iommu_domain *old)
4044 {
4045 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4046 
4047 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4048 	domain_remove_dev_pasid(old, dev, pasid);
4049 
4050 	return 0;
4051 }
4052 
4053 struct dev_pasid_info *
4054 domain_add_dev_pasid(struct iommu_domain *domain,
4055 		     struct device *dev, ioasid_t pasid)
4056 {
4057 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4058 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4059 	struct intel_iommu *iommu = info->iommu;
4060 	struct dev_pasid_info *dev_pasid;
4061 	unsigned long flags;
4062 	int ret;
4063 
4064 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4065 	if (!dev_pasid)
4066 		return ERR_PTR(-ENOMEM);
4067 
4068 	ret = domain_attach_iommu(dmar_domain, iommu);
4069 	if (ret)
4070 		goto out_free;
4071 
4072 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4073 	if (ret)
4074 		goto out_detach_iommu;
4075 
4076 	dev_pasid->dev = dev;
4077 	dev_pasid->pasid = pasid;
4078 	spin_lock_irqsave(&dmar_domain->lock, flags);
4079 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4080 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4081 
4082 	return dev_pasid;
4083 out_detach_iommu:
4084 	domain_detach_iommu(dmar_domain, iommu);
4085 out_free:
4086 	kfree(dev_pasid);
4087 	return ERR_PTR(ret);
4088 }
4089 
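/*
 * Attach a paging domain to one PASID of @dev: validate compatibility,
 * register the (dev, pasid) pair with the domain, program the PASID table
 * entry for first- or second-stage translation, and only then tear down
 * whatever @pasid was previously attached to.
 */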
4090 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4091 				     struct device *dev, ioasid_t pasid,
4092 				     struct iommu_domain *old)
4093 {
4094 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4095 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4096 	struct intel_iommu *iommu = info->iommu;
4097 	struct dev_pasid_info *dev_pasid;
4098 	int ret;
4099 
4100 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4101 		return -EINVAL;
4102 
4103 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4104 		return -EOPNOTSUPP;
4105 
4106 	if (domain->dirty_ops)
4107 		return -EINVAL;
4108 
4109 	if (context_copied(iommu, info->bus, info->devfn))
4110 		return -EBUSY;
4111 
4112 	ret = paging_domain_compatible(domain, dev);
4113 	if (ret)
4114 		return ret;
4115 
4116 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4117 	if (IS_ERR(dev_pasid))
4118 		return PTR_ERR(dev_pasid);
4119 
4120 	if (dmar_domain->use_first_level)
4121 		ret = domain_setup_first_level(iommu, dmar_domain,
4122 					       dev, pasid, old);
4123 	else
4124 		ret = domain_setup_second_level(iommu, dmar_domain,
4125 						dev, pasid, old);
4126 	if (ret)
4127 		goto out_remove_dev_pasid;
4128 
4129 	domain_remove_dev_pasid(old, dev, pasid);
4130 
4131 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4132 
4133 	return 0;
4134 
4135 out_remove_dev_pasid:
4136 	domain_remove_dev_pasid(domain, dev, pasid);
4137 	return ret;
4138 }
4139 
4140 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4141 {
4142 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4143 	struct intel_iommu *iommu = info->iommu;
4144 	struct iommu_hw_info_vtd *vtd;
4145 
4146 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4147 	if (!vtd)
4148 		return ERR_PTR(-ENOMEM);
4149 
4150 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4151 	vtd->cap_reg = iommu->cap;
4152 	vtd->ecap_reg = iommu->ecap;
4153 	*length = sizeof(*vtd);
4154 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4155 	return vtd;
4156 }
4157 
4158 /*
4159  * Set dirty tracking for the device list of a domain. The caller must
4160  * hold the domain->lock when calling it.
4161  */
4162 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4163 {
4164 	struct device_domain_info *info;
4165 	int ret = 0;
4166 
4167 	list_for_each_entry(info, devices, link) {
4168 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4169 						       IOMMU_NO_PASID, enable);
4170 		if (ret)
4171 			break;
4172 	}
4173 
4174 	return ret;
4175 }
4176 
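/*
 * Propagate a dirty-tracking change on a nesting parent to the devices of
 * every first-stage domain nested on it; on failure, roll them back to the
 * parent's current setting.
 */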
4177 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4178 					    bool enable)
4179 {
4180 	struct dmar_domain *s1_domain;
4181 	unsigned long flags;
4182 	int ret;
4183 
4184 	spin_lock(&domain->s1_lock);
4185 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4186 		spin_lock_irqsave(&s1_domain->lock, flags);
4187 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4188 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4189 		if (ret)
4190 			goto err_unwind;
4191 	}
4192 	spin_unlock(&domain->s1_lock);
4193 	return 0;
4194 
4195 err_unwind:
4196 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4197 		spin_lock_irqsave(&s1_domain->lock, flags);
4198 		device_set_dirty_tracking(&s1_domain->devices,
4199 					  domain->dirty_tracking);
4200 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4201 	}
4202 	spin_unlock(&domain->s1_lock);
4203 	return ret;
4204 }
4205 
4206 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4207 					  bool enable)
4208 {
4209 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 	int ret;
4211 
4212 	spin_lock(&dmar_domain->lock);
4213 	if (dmar_domain->dirty_tracking == enable)
4214 		goto out_unlock;
4215 
4216 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4217 	if (ret)
4218 		goto err_unwind;
4219 
4220 	if (dmar_domain->nested_parent) {
4221 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4222 		if (ret)
4223 			goto err_unwind;
4224 	}
4225 
4226 	dmar_domain->dirty_tracking = enable;
4227 out_unlock:
4228 	spin_unlock(&dmar_domain->lock);
4229 
4230 	return 0;
4231 
4232 err_unwind:
4233 	device_set_dirty_tracking(&dmar_domain->devices,
4234 				  dmar_domain->dirty_tracking);
4235 	spin_unlock(&dmar_domain->lock);
4236 	return ret;
4237 }
4238 
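/*
 * Walk the second-stage page table for [iova, iova + size) one leaf entry
 * at a time, stepping by the leaf page size, and record every PTE whose
 * dirty bit tests (and clears) as set into the caller's dirty bitmap.
 */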
4239 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4240 					    unsigned long iova, size_t size,
4241 					    unsigned long flags,
4242 					    struct iommu_dirty_bitmap *dirty)
4243 {
4244 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4245 	unsigned long end = iova + size - 1;
4246 	unsigned long pgsize;
4247 
4248 	/*
4249 	 * The IOMMUFD core calls into a domain with dirty tracking disabled
4250 	 * and no IOVA bitmap set, in order to clear dirty bits in any PTEs
4251 	 * that may have been dirtied while dirty tracking was being stopped.
4252 	 * This ensures we never inherit dirtied bits from a previous cycle.
4253 	 */
4254 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4255 		return -EINVAL;
4256 
4257 	do {
4258 		struct dma_pte *pte;
4259 		int lvl = 0;
4260 
4261 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4262 				     GFP_ATOMIC);
4263 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4264 		if (!pte || !dma_pte_present(pte)) {
4265 			iova += pgsize;
4266 			continue;
4267 		}
4268 
4269 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4270 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4271 		iova += pgsize;
4272 	} while (iova < end);
4273 
4274 	return 0;
4275 }
4276 
4277 static const struct iommu_dirty_ops intel_dirty_ops = {
4278 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4279 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4280 };
4281 
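/*
 * Program a legacy-mode context entry for pass-through translation: the
 * entry uses FLPT_DEFAULT_DID as its domain ID, the widest AGAW the
 * hardware supports and CONTEXT_TT_PASS_THROUGH, and the context cache is
 * flushed for the affected entry afterwards.
 */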
4282 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4283 {
4284 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4285 	struct intel_iommu *iommu = info->iommu;
4286 	struct context_entry *context;
4287 
4288 	spin_lock(&iommu->lock);
4289 	context = iommu_context_addr(iommu, bus, devfn, 1);
4290 	if (!context) {
4291 		spin_unlock(&iommu->lock);
4292 		return -ENOMEM;
4293 	}
4294 
4295 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4296 		spin_unlock(&iommu->lock);
4297 		return 0;
4298 	}
4299 
4300 	copied_context_tear_down(iommu, context, bus, devfn);
4301 	context_clear_entry(context);
4302 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4303 
4304 	/*
4305 	 * In pass through mode, AW must be programmed to indicate the largest
4306 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4307 	 */
4308 	context_set_address_width(context, iommu->msagaw);
4309 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4310 	context_set_fault_enable(context);
4311 	context_set_present(context);
4312 	if (!ecap_coherent(iommu->ecap))
4313 		clflush_cache_range(context, sizeof(*context));
4314 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4315 	spin_unlock(&iommu->lock);
4316 
4317 	return 0;
4318 }
4319 
4320 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4321 {
4322 	struct device *dev = data;
4323 
4324 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4325 }
4326 
4327 static int device_setup_pass_through(struct device *dev)
4328 {
4329 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4330 
4331 	if (!dev_is_pci(dev))
4332 		return context_setup_pass_through(dev, info->bus, info->devfn);
4333 
4334 	return pci_for_each_dma_alias(to_pci_dev(dev),
4335 				      context_setup_pass_through_cb, dev);
4336 }
4337 
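/*
 * Attach @dev to the global identity domain: tear down its current
 * translation, then enable pass-through either via the IOMMU_NO_PASID
 * entry (scalable mode) or via legacy context entries for every DMA alias
 * of the device.
 */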
4338 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4339 {
4340 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4341 	struct intel_iommu *iommu = info->iommu;
4342 	int ret;
4343 
4344 	device_block_translation(dev);
4345 
4346 	if (dev_is_real_dma_subdevice(dev))
4347 		return 0;
4348 
4349 	if (sm_supported(iommu))
4350 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4351 	else
4352 		ret = device_setup_pass_through(dev);
4353 
4354 	return ret;
4355 }
4356 
4357 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4358 					 struct device *dev, ioasid_t pasid,
4359 					 struct iommu_domain *old)
4360 {
4361 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4362 	struct intel_iommu *iommu = info->iommu;
4363 	int ret;
4364 
4365 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4366 		return -EOPNOTSUPP;
4367 
4368 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4369 	if (ret)
4370 		return ret;
4371 
4372 	domain_remove_dev_pasid(old, dev, pasid);
4373 	return 0;
4374 }
4375 
4376 static struct iommu_domain identity_domain = {
4377 	.type = IOMMU_DOMAIN_IDENTITY,
4378 	.ops = &(const struct iommu_domain_ops) {
4379 		.attach_dev	= identity_domain_attach_dev,
4380 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4381 	},
4382 };
4383 
4384 const struct iommu_ops intel_iommu_ops = {
4385 	.blocked_domain		= &blocking_domain,
4386 	.release_domain		= &blocking_domain,
4387 	.identity_domain	= &identity_domain,
4388 	.capable		= intel_iommu_capable,
4389 	.hw_info		= intel_iommu_hw_info,
4390 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4391 	.domain_alloc_sva	= intel_svm_domain_alloc,
4392 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4393 	.probe_device		= intel_iommu_probe_device,
4394 	.release_device		= intel_iommu_release_device,
4395 	.get_resv_regions	= intel_iommu_get_resv_regions,
4396 	.device_group		= intel_iommu_device_group,
4397 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4398 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4399 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4400 	.def_domain_type	= device_def_domain_type,
4401 	.pgsize_bitmap		= SZ_4K,
4402 	.page_response		= intel_iommu_page_response,
4403 	.default_domain_ops = &(const struct iommu_domain_ops) {
4404 		.attach_dev		= intel_iommu_attach_device,
4405 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4406 		.map_pages		= intel_iommu_map_pages,
4407 		.unmap_pages		= intel_iommu_unmap_pages,
4408 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4409 		.flush_iotlb_all        = intel_flush_iotlb_all,
4410 		.iotlb_sync		= intel_iommu_tlb_sync,
4411 		.iova_to_phys		= intel_iommu_iova_to_phys,
4412 		.free			= intel_iommu_domain_free,
4413 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4414 	}
4415 };
4416 
4417 static void quirk_iommu_igfx(struct pci_dev *dev)
4418 {
4419 	if (risky_device(dev))
4420 		return;
4421 
4422 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4423 	disable_igfx_iommu = 1;
4424 }
4425 
4426 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4430 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4434 
4435 /* Broadwell igfx malfunctions with dmar */
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4439 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4440 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4441 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4442 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4457 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4458 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4459 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4460 
4461 static void quirk_iommu_rwbf(struct pci_dev *dev)
4462 {
4463 	if (risky_device(dev))
4464 		return;
4465 
4466 	/*
4467 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4468 	 * but needs it. Same seems to hold for the desktop versions.
4469 	 */
4470 	pci_info(dev, "Forcing write-buffer flush capability\n");
4471 	rwbf_quirk = 1;
4472 }
4473 
4474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4481 
4482 #define GGC 0x52
4483 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4484 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4485 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4486 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4487 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4488 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4489 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4490 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4491 
4492 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4493 {
4494 	unsigned short ggc;
4495 
4496 	if (risky_device(dev))
4497 		return;
4498 
4499 	if (pci_read_config_word(dev, GGC, &ggc))
4500 		return;
4501 
4502 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4503 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4504 		disable_igfx_iommu = 1;
4505 	} else if (!disable_igfx_iommu) {
4506 		/* we have to ensure the gfx device is idle before we flush */
4507 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4508 		iommu_set_dma_strict();
4509 	}
4510 }
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4515 
4516 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4517 {
4518 	unsigned short ver;
4519 
4520 	if (!IS_GFX_DEVICE(dev))
4521 		return;
4522 
4523 	ver = (dev->device >> 8) & 0xff;
4524 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4525 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4526 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4527 		return;
4528 
4529 	if (risky_device(dev))
4530 		return;
4531 
4532 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4533 	iommu_skip_te_disable = 1;
4534 }
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4536 
4537 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4538    ISOCH DMAR unit for the Azalia sound device, but not give it any
4539    TLB entries, which causes it to deadlock. Check for that.  We do
4540    this in a function called from init_dmars(), instead of in a PCI
4541    quirk, because we don't want to print the obnoxious "BIOS broken"
4542    message if VT-d is actually disabled.
4543 */
4544 static void __init check_tylersburg_isoch(void)
4545 {
4546 	struct pci_dev *pdev;
4547 	uint32_t vtisochctrl;
4548 
4549 	/* If there's no Azalia in the system anyway, forget it. */
4550 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4551 	if (!pdev)
4552 		return;
4553 
4554 	if (risky_device(pdev)) {
4555 		pci_dev_put(pdev);
4556 		return;
4557 	}
4558 
4559 	pci_dev_put(pdev);
4560 
4561 	/* System Management Registers. Might be hidden, in which case
4562 	   we can't do the sanity check. But that's OK, because the
4563 	   known-broken BIOSes _don't_ actually hide it, so far. */
4564 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4565 	if (!pdev)
4566 		return;
4567 
4568 	if (risky_device(pdev)) {
4569 		pci_dev_put(pdev);
4570 		return;
4571 	}
4572 
4573 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4574 		pci_dev_put(pdev);
4575 		return;
4576 	}
4577 
4578 	pci_dev_put(pdev);
4579 
4580 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4581 	if (vtisochctrl & 1)
4582 		return;
4583 
4584 	/* Drop all bits other than the number of TLB entries */
4585 	vtisochctrl &= 0x1c;
4586 
4587 	/* If we have the recommended number of TLB entries (16), fine. */
4588 	if (vtisochctrl == 0x10)
4589 		return;
4590 
4591 	/* Zero TLB entries? You get to ride the short bus to school. */
4592 	if (!vtisochctrl) {
4593 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4594 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4595 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4596 		     dmi_get_system_info(DMI_BIOS_VERSION),
4597 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4598 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4599 		return;
4600 	}
4601 
4602 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4603 	       vtisochctrl);
4604 }
4605 
4606 /*
4607  * Here we deal with a device TLB defect where the device may inadvertently
4608  * issue an ATS invalidation completion before posted writes that were
4609  * initiated with a translated address and used translations matching the
4610  * invalidation address range, violating invalidation-completion ordering.
4611  * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4612  * is vulnerable to this defect. In other words, any dTLB invalidation that
4613  * is not initiated under the control of the trusted/privileged host device
4614  * driver must use this quirk.
4615  * Device TLBs are invalidated under the following six conditions:
4616  * 1. Device driver does DMA API unmap IOVA
4617  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4618  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4619  *    exit_mmap() due to crash
4620  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4621  *    VM has to free pages that were unmapped
4622  * 5. Userspace driver unmaps a DMA buffer
4623  * 6. Cache invalidation in vSVA usage (upcoming)
4624  *
4625  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4626  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4627  * invalidate TLB the same way as normal user unmap which will use this quirk.
4628  * The dTLB invalidation after PASID cache flush does not need this quirk.
4629  *
4630  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4631  */
4632 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4633 			       unsigned long address, unsigned long mask,
4634 			       u32 pasid, u16 qdep)
4635 {
4636 	u16 sid;
4637 
4638 	if (likely(!info->dtlb_extra_inval))
4639 		return;
4640 
4641 	sid = PCI_DEVID(info->bus, info->devfn);
4642 	if (pasid == IOMMU_NO_PASID) {
4643 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4644 				   qdep, address, mask);
4645 	} else {
4646 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4647 					 pasid, qdep, address, mask);
4648 	}
4649 }
4650 
4651 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4652 
4653 /*
4654  * Function to submit a command to the enhanced command interface. The
4655  * valid enhanced command descriptions are defined in Table 47 of the
4656  * VT-d spec. The VT-d hardware implementation may support some but not
4657  * all commands, which can be determined by checking the Enhanced
4658  * Command Capability Register.
4659  *
4660  * Return values:
4661  *  - 0: Command successful without any error;
4662  *  - Negative: software error value;
4663  *  - Nonzero positive: failure status code defined in Table 48.
4664  */
4665 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4666 {
4667 	unsigned long flags;
4668 	u64 res;
4669 	int ret;
4670 
4671 	if (!cap_ecmds(iommu->cap))
4672 		return -ENODEV;
4673 
4674 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4675 
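	/* A previously submitted ecmd is still in progress if IP is set. */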
4676 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4677 	if (res & DMA_ECMD_ECRSP_IP) {
4678 		ret = -EBUSY;
4679 		goto err;
4680 	}
4681 
4682 	/*
4683 	 * Unconditionally write the operand B, because
4684 	 * - There is no side effect if an ecmd doesn't require an
4685 	 *   operand B, but we set the register to some value.
4686 	 * - It's not invoked in any critical path. The extra MMIO
4687 	 *   write doesn't bring any performance concerns.
4688 	 */
4689 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4690 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4691 
4692 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4693 		      !(res & DMA_ECMD_ECRSP_IP), res);
4694 
4695 	if (res & DMA_ECMD_ECRSP_IP) {
4696 		ret = -ETIMEDOUT;
4697 		goto err;
4698 	}
4699 
4700 	ret = ecmd_get_status_code(res);
4701 err:
4702 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4703 
4704 	return ret;
4705 }
4706