xref: /linux/drivers/iommu/intel/iommu.c (revision 53564f400572b1b8d9ee5bafb9c226eb1d38600a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
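/*
 * A worked example of the limits above (illustrative, assuming gaw = 48
 * and VTD_PAGE_SHIFT = 12):
 *
 *   __DOMAIN_MAX_PFN(48)  = (1ULL << 36) - 1  = 0xfffffffff
 *   __DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1  = 0xffffffffffff
 *   DOMAIN_MAX_ADDR(48)   = 0xfffffffff << 12 = 0xfffffffff000
 *
 * i.e. DOMAIN_MAX_ADDR is the start of the last addressable page, not the
 * last addressable byte, and DOMAIN_MAX_PFN is clamped so it always fits
 * in an unsigned long.
 */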
56 
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59 
60 #define rwbf_required(iommu)	(rwbf_quirk || cap_rwbf((iommu)->cap))
61 
62 /*
63  * set to 1 to panic the kernel if VT-d can't be successfully enabled
64  * (used when kernel is launched w/ TXT)
65  */
66 static int force_on = 0;
67 static int intel_iommu_tboot_noforce;
68 static int no_platform_optin;
69 
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
71 
72 /*
73  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
74  * if marked present.
75  */
76 static phys_addr_t root_entry_lctp(struct root_entry *re)
77 {
78 	if (!(re->lo & 1))
79 		return 0;
80 
81 	return re->lo & VTD_PAGE_MASK;
82 }
83 
84 /*
85  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
86  * if marked present.
87  */
88 static phys_addr_t root_entry_uctp(struct root_entry *re)
89 {
90 	if (!(re->hi & 1))
91 		return 0;
92 
93 	return re->hi & VTD_PAGE_MASK;
94 }
95 
96 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
97 {
98 	struct device_domain_info *info =
99 		rb_entry(node, struct device_domain_info, node);
100 	const u16 *rid_lhs = key;
101 
102 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
103 		return -1;
104 
105 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
106 		return 1;
107 
108 	return 0;
109 }
110 
111 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
112 {
113 	struct device_domain_info *info =
114 		rb_entry(lhs, struct device_domain_info, node);
115 	u16 key = PCI_DEVID(info->bus, info->devfn);
116 
117 	return device_rid_cmp_key(&key, rhs);
118 }
119 
120 /*
121  * Looks up an IOMMU-probed device using its source ID.
122  *
123  * Returns the pointer to the device if there is a match. Otherwise,
124  * returns NULL.
125  *
126  * Note that this helper doesn't guarantee that the device won't be
127  * released by the iommu subsystem after being returned. The caller
128  * should use its own synchronization mechanism to avoid the device
129  * being released during its use if its possibly the case.
130  */
131 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
132 {
133 	struct device_domain_info *info = NULL;
134 	struct rb_node *node;
135 	unsigned long flags;
136 
137 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
138 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
139 	if (node)
140 		info = rb_entry(node, struct device_domain_info, node);
141 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
142 
143 	return info ? info->dev : NULL;
144 }
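/*
 * Illustrative example (values assumed): a DMA fault reported with
 * source ID 0x00a8 decodes to bus 0x00, devfn 0xa8 (device 0x15,
 * function 0); device_rbtree_find(iommu, 0x00a8) returns the struct
 * device probed for that BDF, or NULL if no such device was probed on
 * this IOMMU.
 */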
145 
146 static int device_rbtree_insert(struct intel_iommu *iommu,
147 				struct device_domain_info *info)
148 {
149 	struct rb_node *curr;
150 	unsigned long flags;
151 
152 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
153 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
154 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
155 	if (WARN_ON(curr))
156 		return -EEXIST;
157 
158 	return 0;
159 }
160 
161 static void device_rbtree_remove(struct device_domain_info *info)
162 {
163 	struct intel_iommu *iommu = info->iommu;
164 	unsigned long flags;
165 
166 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
167 	rb_erase(&info->node, &iommu->device_rbtree);
168 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
169 }
170 
171 struct dmar_rmrr_unit {
172 	struct list_head list;		/* list of rmrr units	*/
173 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
174 	u64	base_address;		/* reserved base address*/
175 	u64	end_address;		/* reserved end address */
176 	struct dmar_dev_scope *devices;	/* target devices */
177 	int	devices_cnt;		/* target device count */
178 };
179 
180 struct dmar_atsr_unit {
181 	struct list_head list;		/* list of ATSR units */
182 	struct acpi_dmar_header *hdr;	/* ACPI header */
183 	struct dmar_dev_scope *devices;	/* target devices */
184 	int devices_cnt;		/* target device count */
185 	u8 include_all:1;		/* include all ports */
186 };
187 
188 struct dmar_satc_unit {
189 	struct list_head list;		/* list of SATC units */
190 	struct acpi_dmar_header *hdr;	/* ACPI header */
191 	struct dmar_dev_scope *devices;	/* target devices */
192 	struct intel_iommu *iommu;	/* the corresponding iommu */
193 	int devices_cnt;		/* target device count */
194 	u8 atc_required:1;		/* ATS is required */
195 };
196 
197 static LIST_HEAD(dmar_atsr_units);
198 static LIST_HEAD(dmar_rmrr_units);
199 static LIST_HEAD(dmar_satc_units);
200 
201 #define for_each_rmrr_units(rmrr) \
202 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
203 
204 static void intel_iommu_domain_free(struct iommu_domain *domain);
205 
206 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
207 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
208 
209 int intel_iommu_enabled = 0;
210 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
211 
212 static int intel_iommu_superpage = 1;
213 static int iommu_identity_mapping;
214 static int iommu_skip_te_disable;
215 static int disable_igfx_iommu;
216 
217 #define IDENTMAP_AZALIA		4
218 
219 const struct iommu_ops intel_iommu_ops;
220 static const struct iommu_dirty_ops intel_dirty_ops;
221 
222 static bool translation_pre_enabled(struct intel_iommu *iommu)
223 {
224 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
225 }
226 
227 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
228 {
229 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
230 }
231 
232 static void init_translation_status(struct intel_iommu *iommu)
233 {
234 	u32 gsts;
235 
236 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
237 	if (gsts & DMA_GSTS_TES)
238 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
239 }
240 
241 static int __init intel_iommu_setup(char *str)
242 {
243 	if (!str)
244 		return -EINVAL;
245 
246 	while (*str) {
247 		if (!strncmp(str, "on", 2)) {
248 			dmar_disabled = 0;
249 			pr_info("IOMMU enabled\n");
250 		} else if (!strncmp(str, "off", 3)) {
251 			dmar_disabled = 1;
252 			no_platform_optin = 1;
253 			pr_info("IOMMU disabled\n");
254 		} else if (!strncmp(str, "igfx_off", 8)) {
255 			disable_igfx_iommu = 1;
256 			pr_info("Disable GFX device mapping\n");
257 		} else if (!strncmp(str, "forcedac", 8)) {
258 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
259 			iommu_dma_forcedac = true;
260 		} else if (!strncmp(str, "strict", 6)) {
261 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
262 			iommu_set_dma_strict();
263 		} else if (!strncmp(str, "sp_off", 6)) {
264 			pr_info("Disable supported super page\n");
265 			intel_iommu_superpage = 0;
266 		} else if (!strncmp(str, "sm_on", 5)) {
267 			pr_info("Enable scalable mode if hardware supports\n");
268 			intel_iommu_sm = 1;
269 		} else if (!strncmp(str, "sm_off", 6)) {
270 			pr_info("Scalable mode is disallowed\n");
271 			intel_iommu_sm = 0;
272 		} else if (!strncmp(str, "tboot_noforce", 13)) {
273 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
274 			intel_iommu_tboot_noforce = 1;
275 		} else {
276 			pr_notice("Unknown option - '%s'\n", str);
277 		}
278 
279 		str += strcspn(str, ",");
280 		while (*str == ',')
281 			str++;
282 	}
283 
284 	return 1;
285 }
286 __setup("intel_iommu=", intel_iommu_setup);
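/*
 * Usage sketch (hypothetical command line): the parser above accepts a
 * comma-separated list, e.g.
 *
 *   intel_iommu=on,sm_on,sp_off
 *
 * which enables DMA remapping, opts into scalable mode when the hardware
 * supports it, and turns off super-page use.
 */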
287 
288 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
289 {
290 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
291 
292 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
293 }
294 
295 /*
296  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
297  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
298  * the returned SAGAW.
299  */
300 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
301 {
302 	unsigned long fl_sagaw, sl_sagaw;
303 
304 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
305 	sl_sagaw = cap_sagaw(iommu->cap);
306 
307 	/* Second level only. */
308 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
309 		return sl_sagaw;
310 
311 	/* First level only. */
312 	if (!ecap_slts(iommu->ecap))
313 		return fl_sagaw;
314 
315 	return fl_sagaw & sl_sagaw;
316 }
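/*
 * Illustrative example (values assumed, per the SAGAW encoding referenced
 * above): BIT(2) means 4-level (48-bit AGAW) and BIT(3) means 5-level
 * (57-bit AGAW) tables. On a scalable-mode IOMMU with cap_sagaw = BIT(2)
 * and first-level 5-level paging supported, fl_sagaw = BIT(2) | BIT(3),
 * sl_sagaw = BIT(2), and the intersection BIT(2) is returned: only
 * 4-level tables are usable by both translation types.
 */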
317 
318 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
319 {
320 	unsigned long sagaw;
321 	int agaw;
322 
323 	sagaw = __iommu_calculate_sagaw(iommu);
324 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
325 		if (test_bit(agaw, &sagaw))
326 			break;
327 	}
328 
329 	return agaw;
330 }
331 
332 /*
333  * Calculate max SAGAW for each iommu.
334  */
335 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
336 {
337 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
338 }
339 
340 /*
341  * Calculate agaw for each iommu.
342  * "SAGAW" may be different across iommus; use a default agaw, and
343  * fall back to a smaller supported agaw for iommus that don't support the default.
344  */
345 int iommu_calculate_agaw(struct intel_iommu *iommu)
346 {
347 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
348 }
349 
350 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
351 {
352 	return sm_supported(iommu) ?
353 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
354 }
355 
356 /* Return the super pagesize bitmap if supported. */
357 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
358 {
359 	unsigned long bitmap = 0;
360 
361 	/*
362 	 * 1-level super page supports page size of 2MiB, 2-level super page
363 	 * supports page size of both 2MiB and 1GiB.
364 	 */
365 	if (domain->iommu_superpage == 1)
366 		bitmap |= SZ_2M;
367 	else if (domain->iommu_superpage == 2)
368 		bitmap |= SZ_2M | SZ_1G;
369 
370 	return bitmap;
371 }
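/*
 * For example, a domain with iommu_superpage == 2 reports
 * SZ_2M | SZ_1G == 0x40200000, while iommu_superpage == 0 (no superpage
 * support) leaves the bitmap empty and only 4KiB mappings are created.
 */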
372 
373 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
374 					 u8 devfn, int alloc)
375 {
376 	struct root_entry *root = &iommu->root_entry[bus];
377 	struct context_entry *context;
378 	u64 *entry;
379 
380 	/*
381 	 * Unless the caller requested to allocate a new entry,
382 	 * returning a copied context entry makes no sense.
383 	 */
384 	if (!alloc && context_copied(iommu, bus, devfn))
385 		return NULL;
386 
387 	entry = &root->lo;
388 	if (sm_supported(iommu)) {
389 		if (devfn >= 0x80) {
390 			devfn -= 0x80;
391 			entry = &root->hi;
392 		}
393 		devfn *= 2;
394 	}
395 	if (*entry & 1)
396 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
397 	else {
398 		unsigned long phy_addr;
399 		if (!alloc)
400 			return NULL;
401 
402 		context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
403 						    SZ_4K);
404 		if (!context)
405 			return NULL;
406 
407 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
408 		phy_addr = virt_to_phys((void *)context);
409 		*entry = phy_addr | 1;
410 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
411 	}
412 	return &context[devfn];
413 }
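/*
 * Worked example (assumed values): in scalable mode each 4KiB context
 * table holds 128 of the 256-bit scalable context entries, i.e. half a
 * bus, so a lookup for bus 0x00, devfn 0x85 takes root_entry[0x00].hi
 * (devfn >= 0x80), rebases devfn to 0x05 and doubles it because each
 * scalable entry spans two legacy-sized slots, returning &context[0x0a].
 */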
414 
415 /**
416  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
417  *				 sub-hierarchy of a candidate PCI-PCI bridge
418  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
419  * @bridge: the candidate PCI-PCI bridge
420  *
421  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
422  */
423 static bool
424 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
425 {
426 	struct pci_dev *pdev, *pbridge;
427 
428 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
429 		return false;
430 
431 	pdev = to_pci_dev(dev);
432 	pbridge = to_pci_dev(bridge);
433 
434 	if (pbridge->subordinate &&
435 	    pbridge->subordinate->number <= pdev->bus->number &&
436 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
437 		return true;
438 
439 	return false;
440 }
441 
442 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
443 {
444 	struct dmar_drhd_unit *drhd;
445 	u32 vtbar;
446 	int rc;
447 
448 	/* We know that this device on this chipset has its own IOMMU.
449 	 * If we find it under a different IOMMU, then the BIOS is lying
450 	 * to us. Hope that the IOMMU for this device is actually
451 	 * disabled, and it needs no translation...
452 	 */
453 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
454 	if (rc) {
455 		/* "can't" happen */
456 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
457 		return false;
458 	}
459 	vtbar &= 0xffff0000;
460 
461 	/* we know that this iommu should be at offset 0xa000 from vtbar */
462 	drhd = dmar_find_matched_drhd_unit(pdev);
463 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
464 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
465 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
466 		return true;
467 	}
468 
469 	return false;
470 }
471 
472 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
473 {
474 	if (!iommu || iommu->drhd->ignored)
475 		return true;
476 
477 	if (dev_is_pci(dev)) {
478 		struct pci_dev *pdev = to_pci_dev(dev);
479 
480 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
481 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
482 		    quirk_ioat_snb_local_iommu(pdev))
483 			return true;
484 	}
485 
486 	return false;
487 }
488 
489 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
490 {
491 	struct dmar_drhd_unit *drhd = NULL;
492 	struct pci_dev *pdev = NULL;
493 	struct intel_iommu *iommu;
494 	struct device *tmp;
495 	u16 segment = 0;
496 	int i;
497 
498 	if (!dev)
499 		return NULL;
500 
501 	if (dev_is_pci(dev)) {
502 		struct pci_dev *pf_pdev;
503 
504 		pdev = pci_real_dma_dev(to_pci_dev(dev));
505 
506 		/* VFs aren't listed in scope tables; we need to look up
507 		 * the PF instead to find the IOMMU. */
508 		pf_pdev = pci_physfn(pdev);
509 		dev = &pf_pdev->dev;
510 		segment = pci_domain_nr(pdev->bus);
511 	} else if (has_acpi_companion(dev))
512 		dev = &ACPI_COMPANION(dev)->dev;
513 
514 	rcu_read_lock();
515 	for_each_iommu(iommu, drhd) {
516 		if (pdev && segment != drhd->segment)
517 			continue;
518 
519 		for_each_active_dev_scope(drhd->devices,
520 					  drhd->devices_cnt, i, tmp) {
521 			if (tmp == dev) {
522 				/* For a VF use its original BDF# not that of the PF
523 				 * which we used for the IOMMU lookup. Strictly speaking
524 				 * we could do this for all PCI devices; we only need to
525 				 * get the BDF# from the scope table for ACPI matches. */
526 				if (pdev && pdev->is_virtfn)
527 					goto got_pdev;
528 
529 				if (bus && devfn) {
530 					*bus = drhd->devices[i].bus;
531 					*devfn = drhd->devices[i].devfn;
532 				}
533 				goto out;
534 			}
535 
536 			if (is_downstream_to_pci_bridge(dev, tmp))
537 				goto got_pdev;
538 		}
539 
540 		if (pdev && drhd->include_all) {
541 got_pdev:
542 			if (bus && devfn) {
543 				*bus = pdev->bus->number;
544 				*devfn = pdev->devfn;
545 			}
546 			goto out;
547 		}
548 	}
549 	iommu = NULL;
550 out:
551 	if (iommu_is_dummy(iommu, dev))
552 		iommu = NULL;
553 
554 	rcu_read_unlock();
555 
556 	return iommu;
557 }
558 
559 static void domain_flush_cache(struct dmar_domain *domain,
560 			       void *addr, int size)
561 {
562 	if (!domain->iommu_coherency)
563 		clflush_cache_range(addr, size);
564 }
565 
566 static void free_context_table(struct intel_iommu *iommu)
567 {
568 	struct context_entry *context;
569 	int i;
570 
571 	if (!iommu->root_entry)
572 		return;
573 
574 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
575 		context = iommu_context_addr(iommu, i, 0, 0);
576 		if (context)
577 			iommu_free_pages(context);
578 
579 		if (!sm_supported(iommu))
580 			continue;
581 
582 		context = iommu_context_addr(iommu, i, 0x80, 0);
583 		if (context)
584 			iommu_free_pages(context);
585 	}
586 
587 	iommu_free_pages(iommu->root_entry);
588 	iommu->root_entry = NULL;
589 }
590 
591 #ifdef CONFIG_DMAR_DEBUG
592 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
593 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
594 {
595 	struct dma_pte *pte;
596 	int offset;
597 
598 	while (1) {
599 		offset = pfn_level_offset(pfn, level);
600 		pte = &parent[offset];
601 
602 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
603 
604 		if (!dma_pte_present(pte)) {
605 			pr_info("page table not present at level %d\n", level - 1);
606 			break;
607 		}
608 
609 		if (level == 1 || dma_pte_superpage(pte))
610 			break;
611 
612 		parent = phys_to_virt(dma_pte_addr(pte));
613 		level--;
614 	}
615 }
616 
617 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
618 			  unsigned long long addr, u32 pasid)
619 {
620 	struct pasid_dir_entry *dir, *pde;
621 	struct pasid_entry *entries, *pte;
622 	struct context_entry *ctx_entry;
623 	struct root_entry *rt_entry;
624 	int i, dir_index, index, level;
625 	u8 devfn = source_id & 0xff;
626 	u8 bus = source_id >> 8;
627 	struct dma_pte *pgtable;
628 
629 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
630 
631 	/* root entry dump */
632 	if (!iommu->root_entry) {
633 		pr_info("root table is not present\n");
634 		return;
635 	}
636 	rt_entry = &iommu->root_entry[bus];
637 
638 	if (sm_supported(iommu))
639 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
640 			rt_entry->hi, rt_entry->lo);
641 	else
642 		pr_info("root entry: 0x%016llx", rt_entry->lo);
643 
644 	/* context entry dump */
645 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
646 	if (!ctx_entry) {
647 		pr_info("context table is not present\n");
648 		return;
649 	}
650 
651 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
652 		ctx_entry->hi, ctx_entry->lo);
653 
654 	/* legacy mode does not require PASID entries */
655 	if (!sm_supported(iommu)) {
656 		if (!context_present(ctx_entry)) {
657 			pr_info("legacy mode page table is not present\n");
658 			return;
659 		}
660 		level = agaw_to_level(ctx_entry->hi & 7);
661 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
662 		goto pgtable_walk;
663 	}
664 
665 	if (!context_present(ctx_entry)) {
666 		pr_info("pasid directory table is not present\n");
667 		return;
668 	}
669 
670 	/* get the pointer to pasid directory entry */
671 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
672 
673 	/* For request-without-pasid, get the pasid from context entry */
674 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
675 		pasid = IOMMU_NO_PASID;
676 
677 	dir_index = pasid >> PASID_PDE_SHIFT;
678 	pde = &dir[dir_index];
679 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
680 
681 	/* get the pointer to the pasid table entry */
682 	entries = get_pasid_table_from_pde(pde);
683 	if (!entries) {
684 		pr_info("pasid table is not present\n");
685 		return;
686 	}
687 	index = pasid & PASID_PTE_MASK;
688 	pte = &entries[index];
689 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
690 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
691 
692 	if (!pasid_pte_is_present(pte)) {
693 		pr_info("scalable mode page table is not present\n");
694 		return;
695 	}
696 
697 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
698 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
699 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
700 	} else {
701 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
702 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
703 	}
704 
705 pgtable_walk:
706 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
707 }
708 #endif
709 
710 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
711 				      unsigned long pfn, int *target_level,
712 				      gfp_t gfp)
713 {
714 	struct dma_pte *parent, *pte;
715 	int level = agaw_to_level(domain->agaw);
716 	int offset;
717 
718 	if (!domain_pfn_supported(domain, pfn))
719 		/* Address beyond IOMMU's addressing capabilities. */
720 		return NULL;
721 
722 	parent = domain->pgd;
723 
724 	while (1) {
725 		void *tmp_page;
726 
727 		offset = pfn_level_offset(pfn, level);
728 		pte = &parent[offset];
729 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
730 			break;
731 		if (level == *target_level)
732 			break;
733 
734 		if (!dma_pte_present(pte)) {
735 			uint64_t pteval, tmp;
736 
737 			tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
738 							     SZ_4K);
739 
740 			if (!tmp_page)
741 				return NULL;
742 
743 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
744 			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
745 				 DMA_PTE_WRITE;
746 			if (domain->use_first_level)
747 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
748 
749 			tmp = 0ULL;
750 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
751 				/* Someone else set it while we were thinking; use theirs. */
752 				iommu_free_pages(tmp_page);
753 			else
754 				domain_flush_cache(domain, pte, sizeof(*pte));
755 		}
756 		if (level == 1)
757 			break;
758 
759 		parent = phys_to_virt(dma_pte_addr(pte));
760 		level--;
761 	}
762 
763 	if (!*target_level)
764 		*target_level = level;
765 
766 	return pte;
767 }
768 
769 /* return the pte of an address at a specific level */
770 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
771 					 unsigned long pfn,
772 					 int level, int *large_page)
773 {
774 	struct dma_pte *parent, *pte;
775 	int total = agaw_to_level(domain->agaw);
776 	int offset;
777 
778 	parent = domain->pgd;
779 	while (level <= total) {
780 		offset = pfn_level_offset(pfn, total);
781 		pte = &parent[offset];
782 		if (level == total)
783 			return pte;
784 
785 		if (!dma_pte_present(pte)) {
786 			*large_page = total;
787 			break;
788 		}
789 
790 		if (dma_pte_superpage(pte)) {
791 			*large_page = total;
792 			return pte;
793 		}
794 
795 		parent = phys_to_virt(dma_pte_addr(pte));
796 		total--;
797 	}
798 	return NULL;
799 }
800 
801 /* clear last level pte, a tlb flush should follow */
802 static void dma_pte_clear_range(struct dmar_domain *domain,
803 				unsigned long start_pfn,
804 				unsigned long last_pfn)
805 {
806 	unsigned int large_page;
807 	struct dma_pte *first_pte, *pte;
808 
809 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
810 	    WARN_ON(start_pfn > last_pfn))
811 		return;
812 
813 	/* we don't need lock here; nobody else touches the iova range */
814 	do {
815 		large_page = 1;
816 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
817 		if (!pte) {
818 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
819 			continue;
820 		}
821 		do {
822 			dma_clear_pte(pte);
823 			start_pfn += lvl_to_nr_pages(large_page);
824 			pte++;
825 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
826 
827 		domain_flush_cache(domain, first_pte,
828 				   (void *)pte - (void *)first_pte);
829 
830 	} while (start_pfn && start_pfn <= last_pfn);
831 }
832 
833 static void dma_pte_free_level(struct dmar_domain *domain, int level,
834 			       int retain_level, struct dma_pte *pte,
835 			       unsigned long pfn, unsigned long start_pfn,
836 			       unsigned long last_pfn)
837 {
838 	pfn = max(start_pfn, pfn);
839 	pte = &pte[pfn_level_offset(pfn, level)];
840 
841 	do {
842 		unsigned long level_pfn;
843 		struct dma_pte *level_pte;
844 
845 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
846 			goto next;
847 
848 		level_pfn = pfn & level_mask(level);
849 		level_pte = phys_to_virt(dma_pte_addr(pte));
850 
851 		if (level > 2) {
852 			dma_pte_free_level(domain, level - 1, retain_level,
853 					   level_pte, level_pfn, start_pfn,
854 					   last_pfn);
855 		}
856 
857 		/*
858 		 * Free the page table if we're below the level we want to
859 		 * retain and the range covers the entire table.
860 		 */
861 		if (level < retain_level && !(start_pfn > level_pfn ||
862 		      last_pfn < level_pfn + level_size(level) - 1)) {
863 			dma_clear_pte(pte);
864 			domain_flush_cache(domain, pte, sizeof(*pte));
865 			iommu_free_pages(level_pte);
866 		}
867 next:
868 		pfn += level_size(level);
869 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
870 }
871 
872 /*
873  * clear last level (leaf) ptes and free page table pages below the
874  * level we wish to keep intact.
875  */
876 static void dma_pte_free_pagetable(struct dmar_domain *domain,
877 				   unsigned long start_pfn,
878 				   unsigned long last_pfn,
879 				   int retain_level)
880 {
881 	dma_pte_clear_range(domain, start_pfn, last_pfn);
882 
883 	/* We don't need lock here; nobody else touches the iova range */
884 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
885 			   domain->pgd, 0, start_pfn, last_pfn);
886 
887 	/* free pgd */
888 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
889 		iommu_free_pages(domain->pgd);
890 		domain->pgd = NULL;
891 	}
892 }
893 
894 /* When a page at a given level is being unlinked from its parent, we don't
895    need to *modify* it at all. All we need to do is make a list of all the
896    pages which can be freed just as soon as we've flushed the IOTLB and we
897    know the hardware page-walk will no longer touch them.
898    The 'pte' argument is the *parent* PTE, pointing to the page that is to
899    be freed. */
900 static void dma_pte_list_pagetables(struct dmar_domain *domain,
901 				    int level, struct dma_pte *parent_pte,
902 				    struct iommu_pages_list *freelist)
903 {
904 	struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
905 
906 	iommu_pages_list_add(freelist, pte);
907 
908 	if (level == 1)
909 		return;
910 
911 	do {
912 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
913 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
914 		pte++;
915 	} while (!first_pte_in_page(pte));
916 }
917 
918 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
919 				struct dma_pte *pte, unsigned long pfn,
920 				unsigned long start_pfn, unsigned long last_pfn,
921 				struct iommu_pages_list *freelist)
922 {
923 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
924 
925 	pfn = max(start_pfn, pfn);
926 	pte = &pte[pfn_level_offset(pfn, level)];
927 
928 	do {
929 		unsigned long level_pfn = pfn & level_mask(level);
930 
931 		if (!dma_pte_present(pte))
932 			goto next;
933 
934 		/* If range covers entire pagetable, free it */
935 		if (start_pfn <= level_pfn &&
936 		    last_pfn >= level_pfn + level_size(level) - 1) {
937 			/* These subordinate page tables are going away entirely. Don't
938 			   bother to clear them; we're just going to *free* them. */
939 			if (level > 1 && !dma_pte_superpage(pte))
940 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
941 
942 			dma_clear_pte(pte);
943 			if (!first_pte)
944 				first_pte = pte;
945 			last_pte = pte;
946 		} else if (level > 1) {
947 			/* Recurse down into a level that isn't *entirely* obsolete */
948 			dma_pte_clear_level(domain, level - 1,
949 					    phys_to_virt(dma_pte_addr(pte)),
950 					    level_pfn, start_pfn, last_pfn,
951 					    freelist);
952 		}
953 next:
954 		pfn = level_pfn + level_size(level);
955 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
956 
957 	if (first_pte)
958 		domain_flush_cache(domain, first_pte,
959 				   (void *)++last_pte - (void *)first_pte);
960 }
961 
962 /* We can't just free the pages because the IOMMU may still be walking
963    the page tables, and may have cached the intermediate levels. The
964    pages can only be freed after the IOTLB flush has been done. */
965 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
966 			 unsigned long last_pfn,
967 			 struct iommu_pages_list *freelist)
968 {
969 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
970 	    WARN_ON(start_pfn > last_pfn))
971 		return;
972 
973 	/* we don't need lock here; nobody else touches the iova range */
974 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
975 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
976 
977 	/* free pgd */
978 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
979 		iommu_pages_list_add(freelist, domain->pgd);
980 		domain->pgd = NULL;
981 	}
982 }
983 
984 /* iommu handling */
985 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
986 {
987 	struct root_entry *root;
988 
989 	root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
990 	if (!root) {
991 		pr_err("Allocating root entry for %s failed\n",
992 			iommu->name);
993 		return -ENOMEM;
994 	}
995 
996 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
997 	iommu->root_entry = root;
998 
999 	return 0;
1000 }
1001 
1002 static void iommu_set_root_entry(struct intel_iommu *iommu)
1003 {
1004 	u64 addr;
1005 	u32 sts;
1006 	unsigned long flag;
1007 
1008 	addr = virt_to_phys(iommu->root_entry);
1009 	if (sm_supported(iommu))
1010 		addr |= DMA_RTADDR_SMT;
1011 
1012 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1013 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1014 
1015 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1016 
1017 	/* Make sure hardware complete it */
1018 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1019 		      readl, (sts & DMA_GSTS_RTPS), sts);
1020 
1021 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1022 
1023 	/*
1024 	 * Hardware invalidates all DMA remapping hardware translation
1025 	 * caches as part of SRTP flow.
1026 	 */
1027 	if (cap_esrtps(iommu->cap))
1028 		return;
1029 
1030 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1031 	if (sm_supported(iommu))
1032 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1033 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1034 }
1035 
1036 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1037 {
1038 	u32 val;
1039 	unsigned long flag;
1040 
1041 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1042 		return;
1043 
1044 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1045 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1046 
1047 	/* Make sure hardware complete it */
1048 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1049 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1050 
1051 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1052 }
1053 
1054 /* return value determines if we need a write buffer flush */
1055 static void __iommu_flush_context(struct intel_iommu *iommu,
1056 				  u16 did, u16 source_id, u8 function_mask,
1057 				  u64 type)
1058 {
1059 	u64 val = 0;
1060 	unsigned long flag;
1061 
1062 	switch (type) {
1063 	case DMA_CCMD_GLOBAL_INVL:
1064 		val = DMA_CCMD_GLOBAL_INVL;
1065 		break;
1066 	case DMA_CCMD_DOMAIN_INVL:
1067 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1068 		break;
1069 	case DMA_CCMD_DEVICE_INVL:
1070 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1071 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1072 		break;
1073 	default:
1074 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1075 			iommu->name, type);
1076 		return;
1077 	}
1078 	val |= DMA_CCMD_ICC;
1079 
1080 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1081 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1082 
1083 	/* Make sure hardware complete it */
1084 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1085 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1086 
1087 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1088 }
1089 
1090 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1091 			 unsigned int size_order, u64 type)
1092 {
1093 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1094 	u64 val = 0, val_iva = 0;
1095 	unsigned long flag;
1096 
1097 	switch (type) {
1098 	case DMA_TLB_GLOBAL_FLUSH:
1099 		/* global flush doesn't need to set IVA_REG */
1100 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1101 		break;
1102 	case DMA_TLB_DSI_FLUSH:
1103 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 		break;
1105 	case DMA_TLB_PSI_FLUSH:
1106 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1107 		/* IH bit is passed in as part of address */
1108 		val_iva = size_order | addr;
1109 		break;
1110 	default:
1111 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1112 			iommu->name, type);
1113 		return;
1114 	}
1115 
1116 	if (cap_write_drain(iommu->cap))
1117 		val |= DMA_TLB_WRITE_DRAIN;
1118 
1119 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1120 	/* Note: Only uses first TLB reg currently */
1121 	if (val_iva)
1122 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1123 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1124 
1125 	/* Make sure hardware complete it */
1126 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1127 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1128 
1129 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1130 
1131 	/* check IOTLB invalidation granularity */
1132 	if (DMA_TLB_IAIG(val) == 0)
1133 		pr_err("Flush IOTLB failed\n");
1134 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1135 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1136 			(unsigned long long)DMA_TLB_IIRG(type),
1137 			(unsigned long long)DMA_TLB_IAIG(val));
1138 }
1139 
1140 static struct device_domain_info *
1141 domain_lookup_dev_info(struct dmar_domain *domain,
1142 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1143 {
1144 	struct device_domain_info *info;
1145 	unsigned long flags;
1146 
1147 	spin_lock_irqsave(&domain->lock, flags);
1148 	list_for_each_entry(info, &domain->devices, link) {
1149 		if (info->iommu == iommu && info->bus == bus &&
1150 		    info->devfn == devfn) {
1151 			spin_unlock_irqrestore(&domain->lock, flags);
1152 			return info;
1153 		}
1154 	}
1155 	spin_unlock_irqrestore(&domain->lock, flags);
1156 
1157 	return NULL;
1158 }
1159 
1160 /*
1161  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1162  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1163  * check because it applies only to the built-in QAT devices and it doesn't
1164  * grant additional privileges.
1165  */
1166 #define BUGGY_QAT_DEVID_MASK 0x4940
1167 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1168 {
1169 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1170 		return false;
1171 
1172 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1173 		return false;
1174 
1175 	return true;
1176 }
1177 
1178 static void iommu_enable_pci_ats(struct device_domain_info *info)
1179 {
1180 	struct pci_dev *pdev;
1181 
1182 	if (!info->ats_supported)
1183 		return;
1184 
1185 	pdev = to_pci_dev(info->dev);
1186 	if (!pci_ats_page_aligned(pdev))
1187 		return;
1188 
1189 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1190 		info->ats_enabled = 1;
1191 }
1192 
1193 static void iommu_disable_pci_ats(struct device_domain_info *info)
1194 {
1195 	if (!info->ats_enabled)
1196 		return;
1197 
1198 	pci_disable_ats(to_pci_dev(info->dev));
1199 	info->ats_enabled = 0;
1200 }
1201 
1202 static void iommu_enable_pci_pri(struct device_domain_info *info)
1203 {
1204 	struct pci_dev *pdev;
1205 
1206 	if (!info->ats_enabled || !info->pri_supported)
1207 		return;
1208 
1209 	pdev = to_pci_dev(info->dev);
1210 	/* PASID is required in PRG Response Message. */
1211 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1212 		return;
1213 
1214 	if (pci_reset_pri(pdev))
1215 		return;
1216 
1217 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
1218 		info->pri_enabled = 1;
1219 }
1220 
1221 static void iommu_disable_pci_pri(struct device_domain_info *info)
1222 {
1223 	if (!info->pri_enabled)
1224 		return;
1225 
1226 	if (WARN_ON(info->iopf_refcount))
1227 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1228 
1229 	pci_disable_pri(to_pci_dev(info->dev));
1230 	info->pri_enabled = 0;
1231 }
1232 
1233 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1234 {
1235 	cache_tag_flush_all(to_dmar_domain(domain));
1236 }
1237 
1238 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1239 {
1240 	u32 pmen;
1241 	unsigned long flags;
1242 
1243 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1244 		return;
1245 
1246 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1247 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1248 	pmen &= ~DMA_PMEN_EPM;
1249 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1250 
1251 	/* wait for the protected region status bit to clear */
1252 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1253 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1254 
1255 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1256 }
1257 
1258 static void iommu_enable_translation(struct intel_iommu *iommu)
1259 {
1260 	u32 sts;
1261 	unsigned long flags;
1262 
1263 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1264 	iommu->gcmd |= DMA_GCMD_TE;
1265 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1266 
1267 	/* Make sure hardware complete it */
1268 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1269 		      readl, (sts & DMA_GSTS_TES), sts);
1270 
1271 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1272 }
1273 
1274 static void iommu_disable_translation(struct intel_iommu *iommu)
1275 {
1276 	u32 sts;
1277 	unsigned long flag;
1278 
1279 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1280 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1281 		return;
1282 
1283 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1284 	iommu->gcmd &= ~DMA_GCMD_TE;
1285 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1286 
1287 	/* Make sure hardware complete it */
1288 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1289 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1290 
1291 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293 
1294 static void disable_dmar_iommu(struct intel_iommu *iommu)
1295 {
1296 	/*
1297 	 * All iommu domains must have been detached from the devices,
1298 	 * hence there should be no domain IDs in use.
1299 	 */
1300 	if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
1301 		return;
1302 
1303 	if (iommu->gcmd & DMA_GCMD_TE)
1304 		iommu_disable_translation(iommu);
1305 }
1306 
1307 static void free_dmar_iommu(struct intel_iommu *iommu)
1308 {
1309 	if (iommu->copied_tables) {
1310 		bitmap_free(iommu->copied_tables);
1311 		iommu->copied_tables = NULL;
1312 	}
1313 
1314 	/* free context mapping */
1315 	free_context_table(iommu);
1316 
1317 	if (ecap_prs(iommu->ecap))
1318 		intel_iommu_finish_prq(iommu);
1319 }
1320 
1321 /*
1322  * Check and return whether first level is used by default for
1323  * DMA translation.
1324  */
1325 static bool first_level_by_default(struct intel_iommu *iommu)
1326 {
1327 	/* Only SL is available in legacy mode */
1328 	if (!sm_supported(iommu))
1329 		return false;
1330 
1331 	/* Only one level (either FL or SL) is available, just use it */
1332 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1333 		return ecap_flts(iommu->ecap);
1334 
1335 	return true;
1336 }
1337 
1338 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1339 {
1340 	struct iommu_domain_info *info, *curr;
1341 	int num, ret = -ENOSPC;
1342 
1343 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1344 		return 0;
1345 
1346 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1347 	if (!info)
1348 		return -ENOMEM;
1349 
1350 	guard(mutex)(&iommu->did_lock);
1351 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1352 	if (curr) {
1353 		curr->refcnt++;
1354 		kfree(info);
1355 		return 0;
1356 	}
1357 
1358 	num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1359 			      cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1360 	if (num < 0) {
1361 		pr_err("%s: No free domain ids\n", iommu->name);
1362 		goto err_unlock;
1363 	}
1364 
1365 	info->refcnt	= 1;
1366 	info->did	= num;
1367 	info->iommu	= iommu;
1368 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1369 			  NULL, info, GFP_KERNEL);
1370 	if (curr) {
1371 		ret = xa_err(curr) ? : -EBUSY;
1372 		goto err_clear;
1373 	}
1374 
1375 	return 0;
1376 
1377 err_clear:
1378 	ida_free(&iommu->domain_ida, info->did);
1379 err_unlock:
1380 	kfree(info);
1381 	return ret;
1382 }
1383 
1384 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1385 {
1386 	struct iommu_domain_info *info;
1387 
1388 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1389 		return;
1390 
1391 	guard(mutex)(&iommu->did_lock);
1392 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1393 	if (--info->refcnt == 0) {
1394 		ida_free(&iommu->domain_ida, info->did);
1395 		xa_erase(&domain->iommu_array, iommu->seq_id);
1396 		kfree(info);
1397 	}
1398 }
1399 
1400 /*
1401  * For kdump cases, old valid entries may be cached due to the
1402  * in-flight DMA and copied pgtable, but there is no unmapping
1403  * behaviour for them, thus we need an explicit cache flush for
1404  * the newly-mapped device. For kdump, at this point, the device
1405  * is supposed to finish reset at its driver probe stage, so no
1406  * in-flight DMA will exist, and we don't need to worry about it
1407  * hereafter.
1408  */
1409 static void copied_context_tear_down(struct intel_iommu *iommu,
1410 				     struct context_entry *context,
1411 				     u8 bus, u8 devfn)
1412 {
1413 	u16 did_old;
1414 
1415 	if (!context_copied(iommu, bus, devfn))
1416 		return;
1417 
1418 	assert_spin_locked(&iommu->lock);
1419 
1420 	did_old = context_domain_id(context);
1421 	context_clear_entry(context);
1422 
1423 	if (did_old < cap_ndoms(iommu->cap)) {
1424 		iommu->flush.flush_context(iommu, did_old,
1425 					   PCI_DEVID(bus, devfn),
1426 					   DMA_CCMD_MASK_NOBIT,
1427 					   DMA_CCMD_DEVICE_INVL);
1428 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1429 					 DMA_TLB_DSI_FLUSH);
1430 	}
1431 
1432 	clear_context_copied(iommu, bus, devfn);
1433 }
1434 
1435 /*
1436  * It's a non-present to present mapping. If hardware doesn't cache
1437  * non-present entries, we only need to flush the write-buffer. If it
1438  * _does_ cache non-present entries, then it does so in the special
1439  * domain #0, which we have to flush:
1440  */
1441 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1442 					u8 bus, u8 devfn)
1443 {
1444 	if (cap_caching_mode(iommu->cap)) {
1445 		iommu->flush.flush_context(iommu, 0,
1446 					   PCI_DEVID(bus, devfn),
1447 					   DMA_CCMD_MASK_NOBIT,
1448 					   DMA_CCMD_DEVICE_INVL);
1449 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1450 	} else {
1451 		iommu_flush_write_buffer(iommu);
1452 	}
1453 }
1454 
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1456 				      struct intel_iommu *iommu,
1457 				      u8 bus, u8 devfn)
1458 {
1459 	struct device_domain_info *info =
1460 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1461 	u16 did = domain_id_iommu(domain, iommu);
1462 	int translation = CONTEXT_TT_MULTI_LEVEL;
1463 	struct dma_pte *pgd = domain->pgd;
1464 	struct context_entry *context;
1465 	int ret;
1466 
1467 	if (WARN_ON(!intel_domain_is_ss_paging(domain)))
1468 		return -EINVAL;
1469 
1470 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1471 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1472 
1473 	spin_lock(&iommu->lock);
1474 	ret = -ENOMEM;
1475 	context = iommu_context_addr(iommu, bus, devfn, 1);
1476 	if (!context)
1477 		goto out_unlock;
1478 
1479 	ret = 0;
1480 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1481 		goto out_unlock;
1482 
1483 	copied_context_tear_down(iommu, context, bus, devfn);
1484 	context_clear_entry(context);
1485 	context_set_domain_id(context, did);
1486 
1487 	if (info && info->ats_supported)
1488 		translation = CONTEXT_TT_DEV_IOTLB;
1489 	else
1490 		translation = CONTEXT_TT_MULTI_LEVEL;
1491 
1492 	context_set_address_root(context, virt_to_phys(pgd));
1493 	context_set_address_width(context, domain->agaw);
1494 	context_set_translation_type(context, translation);
1495 	context_set_fault_enable(context);
1496 	context_set_present(context);
1497 	if (!ecap_coherent(iommu->ecap))
1498 		clflush_cache_range(context, sizeof(*context));
1499 	context_present_cache_flush(iommu, did, bus, devfn);
1500 	ret = 0;
1501 
1502 out_unlock:
1503 	spin_unlock(&iommu->lock);
1504 
1505 	return ret;
1506 }
1507 
1508 static int domain_context_mapping_cb(struct pci_dev *pdev,
1509 				     u16 alias, void *opaque)
1510 {
1511 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1512 	struct intel_iommu *iommu = info->iommu;
1513 	struct dmar_domain *domain = opaque;
1514 
1515 	return domain_context_mapping_one(domain, iommu,
1516 					  PCI_BUS_NUM(alias), alias & 0xff);
1517 }
1518 
1519 static int
1520 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1521 {
1522 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1523 	struct intel_iommu *iommu = info->iommu;
1524 	u8 bus = info->bus, devfn = info->devfn;
1525 	int ret;
1526 
1527 	if (!dev_is_pci(dev))
1528 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1529 
1530 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1531 				     domain_context_mapping_cb, domain);
1532 	if (ret)
1533 		return ret;
1534 
1535 	iommu_enable_pci_ats(info);
1536 
1537 	return 0;
1538 }
1539 
1540 /* Return largest possible superpage level for a given mapping */
1541 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1542 				   unsigned long phy_pfn, unsigned long pages)
1543 {
1544 	int support, level = 1;
1545 	unsigned long pfnmerge;
1546 
1547 	support = domain->iommu_superpage;
1548 
1549 	/* To use a large page, the virtual *and* physical addresses
1550 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1551 	   of them will mean we have to use smaller pages. So just
1552 	   merge them and check both at once. */
1553 	pfnmerge = iov_pfn | phy_pfn;
1554 
1555 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1556 		pages >>= VTD_STRIDE_SHIFT;
1557 		if (!pages)
1558 			break;
1559 		pfnmerge >>= VTD_STRIDE_SHIFT;
1560 		level++;
1561 		support--;
1562 	}
1563 	return level;
1564 }
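/*
 * Worked example (assumed values): with domain->iommu_superpage == 2,
 * iov_pfn == 0x40000, phy_pfn == 0x80000 and pages == 0x40000 (1GiB of
 * 4KiB pages), pfnmerge == 0xc0000 keeps its low 9 bits clear through
 * two iterations, so the function returns level 3 and a 1GiB superpage
 * can be used. If either pfn had any of bits 0-8 set, the first check
 * would fail and the mapping would fall back to level 1 (4KiB).
 */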
1565 
1566 /*
1567  * Ensure that old small page tables are removed to make room for superpage(s).
1568  * We're going to add new large pages, so make sure we don't remove their parent
1569  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1570  */
1571 static void switch_to_super_page(struct dmar_domain *domain,
1572 				 unsigned long start_pfn,
1573 				 unsigned long end_pfn, int level)
1574 {
1575 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1576 	struct dma_pte *pte = NULL;
1577 
1578 	while (start_pfn <= end_pfn) {
1579 		if (!pte)
1580 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1581 					     GFP_ATOMIC);
1582 
1583 		if (dma_pte_present(pte)) {
1584 			dma_pte_free_pagetable(domain, start_pfn,
1585 					       start_pfn + lvl_pages - 1,
1586 					       level + 1);
1587 
1588 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1589 					      end_pfn << VTD_PAGE_SHIFT, 0);
1590 		}
1591 
1592 		pte++;
1593 		start_pfn += lvl_pages;
1594 		if (first_pte_in_page(pte))
1595 			pte = NULL;
1596 	}
1597 }
1598 
1599 static int
1600 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1601 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1602 		 gfp_t gfp)
1603 {
1604 	struct dma_pte *first_pte = NULL, *pte = NULL;
1605 	unsigned int largepage_lvl = 0;
1606 	unsigned long lvl_pages = 0;
1607 	phys_addr_t pteval;
1608 	u64 attr;
1609 
1610 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1611 		return -EINVAL;
1612 
1613 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1614 		return -EINVAL;
1615 
1616 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1617 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1618 		return -EINVAL;
1619 	}
1620 
1621 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1622 	if (domain->use_first_level) {
1623 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1624 		if (prot & DMA_PTE_WRITE)
1625 			attr |= DMA_FL_PTE_DIRTY;
1626 	}
1627 
1628 	domain->has_mappings = true;
1629 
1630 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1631 
1632 	while (nr_pages > 0) {
1633 		uint64_t tmp;
1634 
1635 		if (!pte) {
1636 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1637 					phys_pfn, nr_pages);
1638 
1639 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1640 					     gfp);
1641 			if (!pte)
1642 				return -ENOMEM;
1643 			first_pte = pte;
1644 
1645 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1646 
1647 			/* It is a large page */
1648 			if (largepage_lvl > 1) {
1649 				unsigned long end_pfn;
1650 				unsigned long pages_to_remove;
1651 
1652 				pteval |= DMA_PTE_LARGE_PAGE;
1653 				pages_to_remove = min_t(unsigned long, nr_pages,
1654 							nr_pte_to_next_page(pte) * lvl_pages);
1655 				end_pfn = iov_pfn + pages_to_remove - 1;
1656 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1657 			} else {
1658 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1659 			}
1660 
1661 		}
1662 		/* We don't need lock here, nobody else
1663 		 * touches the iova range
1664 		 */
1665 		tmp = 0ULL;
1666 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1667 			static int dumps = 5;
1668 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1669 				iov_pfn, tmp, (unsigned long long)pteval);
1670 			if (dumps) {
1671 				dumps--;
1672 				debug_dma_dump_mappings(NULL);
1673 			}
1674 			WARN_ON(1);
1675 		}
1676 
1677 		nr_pages -= lvl_pages;
1678 		iov_pfn += lvl_pages;
1679 		phys_pfn += lvl_pages;
1680 		pteval += lvl_pages * VTD_PAGE_SIZE;
1681 
1682 		/* If the next PTE would be the first in a new page, then we
1683 		 * need to flush the cache on the entries we've just written.
1684 		 * And then we'll need to recalculate 'pte', so clear it and
1685 		 * let it get set again in the if (!pte) block above.
1686 		 *
1687 		 * If we're done (!nr_pages) we need to flush the cache too.
1688 		 *
1689 		 * Also if we've been setting superpages, we may need to
1690 		 * recalculate 'pte' and switch back to smaller pages for the
1691 		 * end of the mapping, if the trailing size is not enough to
1692 		 * use another superpage (i.e. nr_pages < lvl_pages).
1693 		 */
1694 		pte++;
1695 		if (!nr_pages || first_pte_in_page(pte) ||
1696 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1697 			domain_flush_cache(domain, first_pte,
1698 					   (void *)pte - (void *)first_pte);
1699 			pte = NULL;
1700 		}
1701 	}
1702 
1703 	return 0;
1704 }
1705 
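/*
 * Clear the legacy-mode context entry for (bus, devfn) on this device's
 * IOMMU, flush the written entry from the cache and invalidate the context
 * cache/IOTLB for the old domain ID via intel_context_flush_no_pasid().
 */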
1706 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1707 {
1708 	struct intel_iommu *iommu = info->iommu;
1709 	struct context_entry *context;
1710 	u16 did;
1711 
1712 	spin_lock(&iommu->lock);
1713 	context = iommu_context_addr(iommu, bus, devfn, 0);
1714 	if (!context) {
1715 		spin_unlock(&iommu->lock);
1716 		return;
1717 	}
1718 
1719 	did = context_domain_id(context);
1720 	context_clear_entry(context);
1721 	__iommu_flush_cache(iommu, context, sizeof(*context));
1722 	spin_unlock(&iommu->lock);
1723 	intel_context_flush_no_pasid(info, context, did);
1724 }
1725 
1726 int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
1727 			       ioasid_t pasid, u16 did, phys_addr_t fsptptr,
1728 			       int flags, struct iommu_domain *old)
1729 {
1730 	if (!old)
1731 		return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
1732 						     did, flags);
1733 	return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
1734 					       iommu_domain_did(old, iommu),
1735 					       flags);
1736 }
1737 
1738 static int domain_setup_second_level(struct intel_iommu *iommu,
1739 				     struct dmar_domain *domain,
1740 				     struct device *dev, ioasid_t pasid,
1741 				     struct iommu_domain *old)
1742 {
1743 	if (!old)
1744 		return intel_pasid_setup_second_level(iommu, domain,
1745 						      dev, pasid);
1746 	return intel_pasid_replace_second_level(iommu, domain, dev,
1747 						iommu_domain_did(old, iommu),
1748 						pasid);
1749 }
1750 
1751 static int domain_setup_passthrough(struct intel_iommu *iommu,
1752 				    struct device *dev, ioasid_t pasid,
1753 				    struct iommu_domain *old)
1754 {
1755 	if (!old)
1756 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1757 	return intel_pasid_replace_pass_through(iommu, dev,
1758 						iommu_domain_did(old, iommu),
1759 						pasid);
1760 }
1761 
1762 static int domain_setup_first_level(struct intel_iommu *iommu,
1763 				    struct dmar_domain *domain,
1764 				    struct device *dev,
1765 				    u32 pasid, struct iommu_domain *old)
1766 {
1767 	struct dma_pte *pgd = domain->pgd;
1768 	int level, flags = 0;
1769 
1770 	level = agaw_to_level(domain->agaw);
1771 	if (level != 4 && level != 5)
1772 		return -EINVAL;
1773 
1774 	if (level == 5)
1775 		flags |= PASID_FLAG_FL5LP;
1776 
1777 	if (domain->force_snooping)
1778 		flags |= PASID_FLAG_PAGE_SNOOP;
1779 
1780 	return __domain_setup_first_level(iommu, dev, pasid,
1781 					  domain_id_iommu(domain, iommu),
1782 					  __pa(pgd), flags, old);
1783 }
1784 
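/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, link the
 * device into the domain's device list, then program the translation
 * (a legacy context entry, or a first/second-stage PASID entry for
 * IOMMU_NO_PASID in scalable mode) and assign a cache tag.  On failure the
 * device is put back into the blocked state.
 */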
1785 static int dmar_domain_attach_device(struct dmar_domain *domain,
1786 				     struct device *dev)
1787 {
1788 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1789 	struct intel_iommu *iommu = info->iommu;
1790 	unsigned long flags;
1791 	int ret;
1792 
1793 	ret = domain_attach_iommu(domain, iommu);
1794 	if (ret)
1795 		return ret;
1796 
1797 	info->domain = domain;
1798 	info->domain_attached = true;
1799 	spin_lock_irqsave(&domain->lock, flags);
1800 	list_add(&info->link, &domain->devices);
1801 	spin_unlock_irqrestore(&domain->lock, flags);
1802 
1803 	if (dev_is_real_dma_subdevice(dev))
1804 		return 0;
1805 
1806 	if (!sm_supported(iommu))
1807 		ret = domain_context_mapping(domain, dev);
1808 	else if (intel_domain_is_fs_paging(domain))
1809 		ret = domain_setup_first_level(iommu, domain, dev,
1810 					       IOMMU_NO_PASID, NULL);
1811 	else if (intel_domain_is_ss_paging(domain))
1812 		ret = domain_setup_second_level(iommu, domain, dev,
1813 						IOMMU_NO_PASID, NULL);
1814 	else if (WARN_ON(true))
1815 		ret = -EINVAL;
1816 
1817 	if (ret)
1818 		goto out_block_translation;
1819 
1820 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1821 	if (ret)
1822 		goto out_block_translation;
1823 
1824 	return 0;
1825 
1826 out_block_translation:
1827 	device_block_translation(dev);
1828 	return ret;
1829 }
1830 
1831 /**
1832  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1833  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1834  * @dev: device handle
1835  *
1836  * We assume that PCI USB devices with RMRRs have them largely
1837  * for historical reasons and that the RMRR space is not actively used post
1838  * boot.  This exclusion may change if vendors begin to abuse it.
1839  *
1840  * The same exception is made for graphics devices, with the requirement that
1841  * any use of the RMRR regions will be torn down before assigning the device
1842  * to a guest.
1843  *
1844  * Return: true if the RMRR is relaxable, false otherwise
1845  */
1846 static bool device_rmrr_is_relaxable(struct device *dev)
1847 {
1848 	struct pci_dev *pdev;
1849 
1850 	if (!dev_is_pci(dev))
1851 		return false;
1852 
1853 	pdev = to_pci_dev(dev);
1854 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1855 		return true;
1856 	else
1857 		return false;
1858 }
1859 
1860 static int device_def_domain_type(struct device *dev)
1861 {
1862 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1863 	struct intel_iommu *iommu = info->iommu;
1864 
1865 	/*
1866 	 * Hardware does not support the passthrough translation mode.
1867 	 * Always use a dynamic mapping domain.
1868 	 */
1869 	if (!ecap_pass_through(iommu->ecap))
1870 		return IOMMU_DOMAIN_DMA;
1871 
1872 	if (dev_is_pci(dev)) {
1873 		struct pci_dev *pdev = to_pci_dev(dev);
1874 
1875 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1876 			return IOMMU_DOMAIN_IDENTITY;
1877 	}
1878 
1879 	return 0;
1880 }
1881 
1882 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1883 {
1884 	/*
1885 	 * Start from a sane IOMMU hardware state.
1886 	 * If queued invalidation was already initialized by us (for
1887 	 * example, while enabling interrupt remapping), then things
1888 	 * are already rolling from a sane state.
1889 	 */
1890 	if (!iommu->qi) {
1891 		/*
1892 		 * Clear any previous faults.
1893 		 */
1894 		dmar_fault(-1, iommu);
1895 		/*
1896 		 * Disable queued invalidation if supported and already enabled
1897 		 * before OS handover.
1898 		 */
1899 		dmar_disable_qi(iommu);
1900 	}
1901 
1902 	if (dmar_enable_qi(iommu)) {
1903 		/*
1904 		 * Queued invalidation not enabled; use register-based invalidation.
1905 		 */
1906 		iommu->flush.flush_context = __iommu_flush_context;
1907 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1908 		pr_info("%s: Using Register based invalidation\n",
1909 			iommu->name);
1910 	} else {
1911 		iommu->flush.flush_context = qi_flush_context;
1912 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1913 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1914 	}
1915 }
1916 
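/*
 * Copy one bus's context table(s) from the previous kernel (kdump case).
 * The old table is memremap()'d, present entries are copied into freshly
 * allocated pages, their domain IDs are reserved in the IOMMU's domain IDA,
 * and the affected (bus, devfn) pairs are marked as copied.
 */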
1917 static int copy_context_table(struct intel_iommu *iommu,
1918 			      struct root_entry *old_re,
1919 			      struct context_entry **tbl,
1920 			      int bus, bool ext)
1921 {
1922 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1923 	struct context_entry *new_ce = NULL, ce;
1924 	struct context_entry *old_ce = NULL;
1925 	struct root_entry re;
1926 	phys_addr_t old_ce_phys;
1927 
1928 	tbl_idx = ext ? bus * 2 : bus;
1929 	memcpy(&re, old_re, sizeof(re));
1930 
1931 	for (devfn = 0; devfn < 256; devfn++) {
1932 		/* First calculate the correct index */
1933 		idx = (ext ? devfn * 2 : devfn) % 256;
1934 
1935 		if (idx == 0) {
1936 			/* First save what we may have and clean up */
1937 			if (new_ce) {
1938 				tbl[tbl_idx] = new_ce;
1939 				__iommu_flush_cache(iommu, new_ce,
1940 						    VTD_PAGE_SIZE);
1941 				pos = 1;
1942 			}
1943 
1944 			if (old_ce)
1945 				memunmap(old_ce);
1946 
1947 			ret = 0;
1948 			if (devfn < 0x80)
1949 				old_ce_phys = root_entry_lctp(&re);
1950 			else
1951 				old_ce_phys = root_entry_uctp(&re);
1952 
1953 			if (!old_ce_phys) {
1954 				if (ext && devfn == 0) {
1955 					/* No LCTP, try UCTP */
1956 					devfn = 0x7f;
1957 					continue;
1958 				} else {
1959 					goto out;
1960 				}
1961 			}
1962 
1963 			ret = -ENOMEM;
1964 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1965 					MEMREMAP_WB);
1966 			if (!old_ce)
1967 				goto out;
1968 
1969 			new_ce = iommu_alloc_pages_node_sz(iommu->node,
1970 							   GFP_KERNEL, SZ_4K);
1971 			if (!new_ce)
1972 				goto out_unmap;
1973 
1974 			ret = 0;
1975 		}
1976 
1977 		/* Now copy the context entry */
1978 		memcpy(&ce, old_ce + idx, sizeof(ce));
1979 
1980 		if (!context_present(&ce))
1981 			continue;
1982 
1983 		did = context_domain_id(&ce);
1984 		if (did >= 0 && did < cap_ndoms(iommu->cap))
1985 			ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
1986 
1987 		set_context_copied(iommu, bus, devfn);
1988 		new_ce[idx] = ce;
1989 	}
1990 
1991 	tbl[tbl_idx + pos] = new_ce;
1992 
1993 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
1994 
1995 out_unmap:
1996 	memunmap(old_ce);
1997 
1998 out:
1999 	return ret;
2000 }
2001 
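/*
 * Copy the root/context table set left behind by the previous kernel into
 * this kernel's structures.  Bail out if the old and new root table formats
 * (scalable vs. legacy) differ, since switching the format would require
 * disabling translation.
 */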
2002 static int copy_translation_tables(struct intel_iommu *iommu)
2003 {
2004 	struct context_entry **ctxt_tbls;
2005 	struct root_entry *old_rt;
2006 	phys_addr_t old_rt_phys;
2007 	int ctxt_table_entries;
2008 	u64 rtaddr_reg;
2009 	int bus, ret;
2010 	bool new_ext, ext;
2011 
2012 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2013 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2014 	new_ext    = !!sm_supported(iommu);
2015 
2016 	/*
2017 	 * The RTT bit can only be changed when translation is disabled,
2018 	 * but disabling translation would open a window for data
2019 	 * corruption. So bail out and don't copy anything if we would
2020 	 * have to change the bit.
2021 	 */
2022 	if (new_ext != ext)
2023 		return -EINVAL;
2024 
2025 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2026 	if (!iommu->copied_tables)
2027 		return -ENOMEM;
2028 
2029 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2030 	if (!old_rt_phys)
2031 		return -EINVAL;
2032 
2033 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2034 	if (!old_rt)
2035 		return -ENOMEM;
2036 
2037 	/* This is too big for the stack - allocate it from slab */
2038 	ctxt_table_entries = ext ? 512 : 256;
2039 	ret = -ENOMEM;
2040 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2041 	if (!ctxt_tbls)
2042 		goto out_unmap;
2043 
2044 	for (bus = 0; bus < 256; bus++) {
2045 		ret = copy_context_table(iommu, &old_rt[bus],
2046 					 ctxt_tbls, bus, ext);
2047 		if (ret) {
2048 			pr_err("%s: Failed to copy context table for bus %d\n",
2049 				iommu->name, bus);
2050 			continue;
2051 		}
2052 	}
2053 
2054 	spin_lock(&iommu->lock);
2055 
2056 	/* Context tables are copied, now write them to the root_entry table */
2057 	for (bus = 0; bus < 256; bus++) {
2058 		int idx = ext ? bus * 2 : bus;
2059 		u64 val;
2060 
2061 		if (ctxt_tbls[idx]) {
2062 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2063 			iommu->root_entry[bus].lo = val;
2064 		}
2065 
2066 		if (!ext || !ctxt_tbls[idx + 1])
2067 			continue;
2068 
2069 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2070 		iommu->root_entry[bus].hi = val;
2071 	}
2072 
2073 	spin_unlock(&iommu->lock);
2074 
2075 	kfree(ctxt_tbls);
2076 
2077 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2078 
2079 	ret = 0;
2080 
2081 out_unmap:
2082 	memunmap(old_rt);
2083 
2084 	return ret;
2085 }
2086 
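/*
 * Boot-time initialization of all DMAR units: set up queued invalidation,
 * allocate (or copy from the previous kernel) the root entry tables, program
 * the root pointer, and enable the fault and page-request interrupts.
 * Translation itself is enabled later from intel_iommu_init().
 */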
2087 static int __init init_dmars(void)
2088 {
2089 	struct dmar_drhd_unit *drhd;
2090 	struct intel_iommu *iommu;
2091 	int ret;
2092 
2093 	for_each_iommu(iommu, drhd) {
2094 		if (drhd->ignored) {
2095 			iommu_disable_translation(iommu);
2096 			continue;
2097 		}
2098 
2099 		/*
2100 		 * Find the max PASID size of all IOMMUs in the system.
2101 		 * We need to ensure the system pasid table is no bigger
2102 		 * than the smallest supported.
2103 		 */
2104 		if (pasid_supported(iommu)) {
2105 			u32 temp = 2 << ecap_pss(iommu->ecap);
2106 
2107 			intel_pasid_max_id = min_t(u32, temp,
2108 						   intel_pasid_max_id);
2109 		}
2110 
2111 		intel_iommu_init_qi(iommu);
2112 		init_translation_status(iommu);
2113 
2114 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2115 			iommu_disable_translation(iommu);
2116 			clear_translation_pre_enabled(iommu);
2117 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2118 				iommu->name);
2119 		}
2120 
2121 		/*
2122 		 * TBD:
2123 		 * we could share the same root & context tables
2124 		 * among all IOMMUs. This needs to be split out later.
2125 		 */
2126 		ret = iommu_alloc_root_entry(iommu);
2127 		if (ret)
2128 			goto free_iommu;
2129 
2130 		if (translation_pre_enabled(iommu)) {
2131 			pr_info("Translation already enabled - trying to copy translation structures\n");
2132 
2133 			ret = copy_translation_tables(iommu);
2134 			if (ret) {
2135 				/*
2136 				 * We found the IOMMU with translation
2137 				 * enabled - but failed to copy over the
2138 				 * old root-entry table. Try to proceed
2139 				 * by disabling translation now and
2140 				 * allocating a clean root-entry table.
2141 				 * This might cause DMAR faults, but
2142 				 * probably the dump will still succeed.
2143 				 */
2144 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2145 				       iommu->name);
2146 				iommu_disable_translation(iommu);
2147 				clear_translation_pre_enabled(iommu);
2148 			} else {
2149 				pr_info("Copied translation tables from previous kernel for %s\n",
2150 					iommu->name);
2151 			}
2152 		}
2153 
2154 		intel_svm_check(iommu);
2155 	}
2156 
2157 	/*
2158 	 * Now that qi is enabled on all iommus, set the root entry and flush
2159 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2160 	 * flush_context function will loop forever and the boot hangs.
2161 	 */
2162 	for_each_active_iommu(iommu, drhd) {
2163 		iommu_flush_write_buffer(iommu);
2164 		iommu_set_root_entry(iommu);
2165 	}
2166 
2167 	check_tylersburg_isoch();
2168 
2169 	/*
2170 	 * for each drhd
2171 	 *   enable fault log
2172 	 *   global invalidate context cache
2173 	 *   global invalidate iotlb
2174 	 *   enable translation
2175 	 */
2176 	for_each_iommu(iommu, drhd) {
2177 		if (drhd->ignored) {
2178 			/*
2179 			 * we always have to disable PMRs or DMA may fail on
2180 			 * this device
2181 			 */
2182 			if (force_on)
2183 				iommu_disable_protect_mem_regions(iommu);
2184 			continue;
2185 		}
2186 
2187 		iommu_flush_write_buffer(iommu);
2188 
2189 		if (ecap_prs(iommu->ecap)) {
2190 			/*
2191 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2192 			 * could cause a possible lock race condition.
2193 			 */
2194 			up_write(&dmar_global_lock);
2195 			ret = intel_iommu_enable_prq(iommu);
2196 			down_write(&dmar_global_lock);
2197 			if (ret)
2198 				goto free_iommu;
2199 		}
2200 
2201 		ret = dmar_set_interrupt(iommu);
2202 		if (ret)
2203 			goto free_iommu;
2204 	}
2205 
2206 	return 0;
2207 
2208 free_iommu:
2209 	for_each_active_iommu(iommu, drhd) {
2210 		disable_dmar_iommu(iommu);
2211 		free_dmar_iommu(iommu);
2212 	}
2213 
2214 	return ret;
2215 }
2216 
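/*
 * Mark DMAR units that can be ignored: units whose device scope contains no
 * active devices, and (when disable_igfx_iommu is set) units that cover only
 * graphics devices.  Graphics-only units are also flagged as gfx_dedicated.
 */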
2217 static void __init init_no_remapping_devices(void)
2218 {
2219 	struct dmar_drhd_unit *drhd;
2220 	struct device *dev;
2221 	int i;
2222 
2223 	for_each_drhd_unit(drhd) {
2224 		if (!drhd->include_all) {
2225 			for_each_active_dev_scope(drhd->devices,
2226 						  drhd->devices_cnt, i, dev)
2227 				break;
2228 			/* ignore DMAR unit if no devices exist */
2229 			if (i == drhd->devices_cnt)
2230 				drhd->ignored = 1;
2231 		}
2232 	}
2233 
2234 	for_each_active_drhd_unit(drhd) {
2235 		if (drhd->include_all)
2236 			continue;
2237 
2238 		for_each_active_dev_scope(drhd->devices,
2239 					  drhd->devices_cnt, i, dev)
2240 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2241 				break;
2242 		if (i < drhd->devices_cnt)
2243 			continue;
2244 
2245 		/* This IOMMU has *only* gfx devices. Either bypass it or
2246 		   mark it as gfx-dedicated, as appropriate. */
2247 		drhd->gfx_dedicated = 1;
2248 		if (disable_igfx_iommu)
2249 			drhd->ignored = 1;
2250 	}
2251 }
2252 
2253 #ifdef CONFIG_SUSPEND
2254 static int init_iommu_hw(void)
2255 {
2256 	struct dmar_drhd_unit *drhd;
2257 	struct intel_iommu *iommu = NULL;
2258 	int ret;
2259 
2260 	for_each_active_iommu(iommu, drhd) {
2261 		if (iommu->qi) {
2262 			ret = dmar_reenable_qi(iommu);
2263 			if (ret)
2264 				return ret;
2265 		}
2266 	}
2267 
2268 	for_each_iommu(iommu, drhd) {
2269 		if (drhd->ignored) {
2270 			/*
2271 			 * we always have to disable PMRs or DMA may fail on
2272 			 * this device
2273 			 */
2274 			if (force_on)
2275 				iommu_disable_protect_mem_regions(iommu);
2276 			continue;
2277 		}
2278 
2279 		iommu_flush_write_buffer(iommu);
2280 		iommu_set_root_entry(iommu);
2281 		iommu_enable_translation(iommu);
2282 		iommu_disable_protect_mem_regions(iommu);
2283 	}
2284 
2285 	return 0;
2286 }
2287 
2288 static void iommu_flush_all(void)
2289 {
2290 	struct dmar_drhd_unit *drhd;
2291 	struct intel_iommu *iommu;
2292 
2293 	for_each_active_iommu(iommu, drhd) {
2294 		iommu->flush.flush_context(iommu, 0, 0, 0,
2295 					   DMA_CCMD_GLOBAL_INVL);
2296 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2297 					 DMA_TLB_GLOBAL_FLUSH);
2298 	}
2299 }
2300 
2301 static int iommu_suspend(void)
2302 {
2303 	struct dmar_drhd_unit *drhd;
2304 	struct intel_iommu *iommu = NULL;
2305 	unsigned long flag;
2306 
2307 	iommu_flush_all();
2308 
2309 	for_each_active_iommu(iommu, drhd) {
2310 		iommu_disable_translation(iommu);
2311 
2312 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2313 
2314 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2315 			readl(iommu->reg + DMAR_FECTL_REG);
2316 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2317 			readl(iommu->reg + DMAR_FEDATA_REG);
2318 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2319 			readl(iommu->reg + DMAR_FEADDR_REG);
2320 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2321 			readl(iommu->reg + DMAR_FEUADDR_REG);
2322 
2323 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2324 	}
2325 	return 0;
2326 }
2327 
2328 static void iommu_resume(void)
2329 {
2330 	struct dmar_drhd_unit *drhd;
2331 	struct intel_iommu *iommu = NULL;
2332 	unsigned long flag;
2333 
2334 	if (init_iommu_hw()) {
2335 		if (force_on)
2336 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2337 		else
2338 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2339 		return;
2340 	}
2341 
2342 	for_each_active_iommu(iommu, drhd) {
2343 
2344 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2345 
2346 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2347 			iommu->reg + DMAR_FECTL_REG);
2348 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2349 			iommu->reg + DMAR_FEDATA_REG);
2350 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2351 			iommu->reg + DMAR_FEADDR_REG);
2352 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2353 			iommu->reg + DMAR_FEUADDR_REG);
2354 
2355 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2356 	}
2357 }
2358 
2359 static struct syscore_ops iommu_syscore_ops = {
2360 	.resume		= iommu_resume,
2361 	.suspend	= iommu_suspend,
2362 };
2363 
2364 static void __init init_iommu_pm_ops(void)
2365 {
2366 	register_syscore_ops(&iommu_syscore_ops);
2367 }
2368 
2369 #else
2370 static inline void init_iommu_pm_ops(void) {}
2371 #endif	/* CONFIG_SUSPEND */
2372 
2373 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2374 {
2375 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2376 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2377 	    rmrr->end_address <= rmrr->base_address ||
2378 	    arch_rmrr_sanity_check(rmrr))
2379 		return -EINVAL;
2380 
2381 	return 0;
2382 }
2383 
2384 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2385 {
2386 	struct acpi_dmar_reserved_memory *rmrr;
2387 	struct dmar_rmrr_unit *rmrru;
2388 
2389 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2390 	if (rmrr_sanity_check(rmrr)) {
2391 		pr_warn(FW_BUG
2392 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2393 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2394 			   rmrr->base_address, rmrr->end_address,
2395 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2396 			   dmi_get_system_info(DMI_BIOS_VERSION),
2397 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2398 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2399 	}
2400 
2401 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2402 	if (!rmrru)
2403 		goto out;
2404 
2405 	rmrru->hdr = header;
2406 
2407 	rmrru->base_address = rmrr->base_address;
2408 	rmrru->end_address = rmrr->end_address;
2409 
2410 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2411 				((void *)rmrr) + rmrr->header.length,
2412 				&rmrru->devices_cnt);
2413 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2414 		goto free_rmrru;
2415 
2416 	list_add(&rmrru->list, &dmar_rmrr_units);
2417 
2418 	return 0;
2419 free_rmrru:
2420 	kfree(rmrru);
2421 out:
2422 	return -ENOMEM;
2423 }
2424 
2425 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2426 {
2427 	struct dmar_atsr_unit *atsru;
2428 	struct acpi_dmar_atsr *tmp;
2429 
2430 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2431 				dmar_rcu_check()) {
2432 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2433 		if (atsr->segment != tmp->segment)
2434 			continue;
2435 		if (atsr->header.length != tmp->header.length)
2436 			continue;
2437 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2438 			return atsru;
2439 	}
2440 
2441 	return NULL;
2442 }
2443 
2444 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2445 {
2446 	struct acpi_dmar_atsr *atsr;
2447 	struct dmar_atsr_unit *atsru;
2448 
2449 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2450 		return 0;
2451 
2452 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2453 	atsru = dmar_find_atsr(atsr);
2454 	if (atsru)
2455 		return 0;
2456 
2457 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2458 	if (!atsru)
2459 		return -ENOMEM;
2460 
2461 	/*
2462 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2463 	 * copy the memory content because the memory buffer will be freed
2464 	 * on return.
2465 	 */
2466 	atsru->hdr = (void *)(atsru + 1);
2467 	memcpy(atsru->hdr, hdr, hdr->length);
2468 	atsru->include_all = atsr->flags & 0x1;
2469 	if (!atsru->include_all) {
2470 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2471 				(void *)atsr + atsr->header.length,
2472 				&atsru->devices_cnt);
2473 		if (atsru->devices_cnt && atsru->devices == NULL) {
2474 			kfree(atsru);
2475 			return -ENOMEM;
2476 		}
2477 	}
2478 
2479 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2480 
2481 	return 0;
2482 }
2483 
2484 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2485 {
2486 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2487 	kfree(atsru);
2488 }
2489 
2490 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2491 {
2492 	struct acpi_dmar_atsr *atsr;
2493 	struct dmar_atsr_unit *atsru;
2494 
2495 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2496 	atsru = dmar_find_atsr(atsr);
2497 	if (atsru) {
2498 		list_del_rcu(&atsru->list);
2499 		synchronize_rcu();
2500 		intel_iommu_free_atsr(atsru);
2501 	}
2502 
2503 	return 0;
2504 }
2505 
2506 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2507 {
2508 	int i;
2509 	struct device *dev;
2510 	struct acpi_dmar_atsr *atsr;
2511 	struct dmar_atsr_unit *atsru;
2512 
2513 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2514 	atsru = dmar_find_atsr(atsr);
2515 	if (!atsru)
2516 		return 0;
2517 
2518 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2519 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2520 					  i, dev)
2521 			return -EBUSY;
2522 	}
2523 
2524 	return 0;
2525 }
2526 
2527 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2528 {
2529 	struct dmar_satc_unit *satcu;
2530 	struct acpi_dmar_satc *tmp;
2531 
2532 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2533 				dmar_rcu_check()) {
2534 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2535 		if (satc->segment != tmp->segment)
2536 			continue;
2537 		if (satc->header.length != tmp->header.length)
2538 			continue;
2539 		if (memcmp(satc, tmp, satc->header.length) == 0)
2540 			return satcu;
2541 	}
2542 
2543 	return NULL;
2544 }
2545 
2546 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2547 {
2548 	struct acpi_dmar_satc *satc;
2549 	struct dmar_satc_unit *satcu;
2550 
2551 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2552 		return 0;
2553 
2554 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2555 	satcu = dmar_find_satc(satc);
2556 	if (satcu)
2557 		return 0;
2558 
2559 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2560 	if (!satcu)
2561 		return -ENOMEM;
2562 
2563 	satcu->hdr = (void *)(satcu + 1);
2564 	memcpy(satcu->hdr, hdr, hdr->length);
2565 	satcu->atc_required = satc->flags & 0x1;
2566 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2567 					      (void *)satc + satc->header.length,
2568 					      &satcu->devices_cnt);
2569 	if (satcu->devices_cnt && !satcu->devices) {
2570 		kfree(satcu);
2571 		return -ENOMEM;
2572 	}
2573 	list_add_rcu(&satcu->list, &dmar_satc_units);
2574 
2575 	return 0;
2576 }
2577 
2578 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2579 {
2580 	struct intel_iommu *iommu = dmaru->iommu;
2581 	int ret;
2582 
2583 	/*
2584 	 * Disable translation if already enabled prior to OS handover.
2585 	 */
2586 	if (iommu->gcmd & DMA_GCMD_TE)
2587 		iommu_disable_translation(iommu);
2588 
2589 	ret = iommu_alloc_root_entry(iommu);
2590 	if (ret)
2591 		goto out;
2592 
2593 	intel_svm_check(iommu);
2594 
2595 	if (dmaru->ignored) {
2596 		/*
2597 		 * we always have to disable PMRs or DMA may fail on this device
2598 		 */
2599 		if (force_on)
2600 			iommu_disable_protect_mem_regions(iommu);
2601 		return 0;
2602 	}
2603 
2604 	intel_iommu_init_qi(iommu);
2605 	iommu_flush_write_buffer(iommu);
2606 
2607 	if (ecap_prs(iommu->ecap)) {
2608 		ret = intel_iommu_enable_prq(iommu);
2609 		if (ret)
2610 			goto disable_iommu;
2611 	}
2612 
2613 	ret = dmar_set_interrupt(iommu);
2614 	if (ret)
2615 		goto disable_iommu;
2616 
2617 	iommu_set_root_entry(iommu);
2618 	iommu_enable_translation(iommu);
2619 
2620 	iommu_disable_protect_mem_regions(iommu);
2621 	return 0;
2622 
2623 disable_iommu:
2624 	disable_dmar_iommu(iommu);
2625 out:
2626 	free_dmar_iommu(iommu);
2627 	return ret;
2628 }
2629 
2630 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2631 {
2632 	int ret = 0;
2633 	struct intel_iommu *iommu = dmaru->iommu;
2634 
2635 	if (!intel_iommu_enabled)
2636 		return 0;
2637 	if (iommu == NULL)
2638 		return -EINVAL;
2639 
2640 	if (insert) {
2641 		ret = intel_iommu_add(dmaru);
2642 	} else {
2643 		disable_dmar_iommu(iommu);
2644 		free_dmar_iommu(iommu);
2645 	}
2646 
2647 	return ret;
2648 }
2649 
2650 static void intel_iommu_free_dmars(void)
2651 {
2652 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2653 	struct dmar_atsr_unit *atsru, *atsr_n;
2654 	struct dmar_satc_unit *satcu, *satc_n;
2655 
2656 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2657 		list_del(&rmrru->list);
2658 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2659 		kfree(rmrru);
2660 	}
2661 
2662 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2663 		list_del(&atsru->list);
2664 		intel_iommu_free_atsr(atsru);
2665 	}
2666 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2667 		list_del(&satcu->list);
2668 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2669 		kfree(satcu);
2670 	}
2671 }
2672 
2673 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2674 {
2675 	struct dmar_satc_unit *satcu;
2676 	struct acpi_dmar_satc *satc;
2677 	struct device *tmp;
2678 	int i;
2679 
2680 	rcu_read_lock();
2681 
2682 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2683 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2684 		if (satc->segment != pci_domain_nr(dev->bus))
2685 			continue;
2686 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2687 			if (to_pci_dev(tmp) == dev)
2688 				goto out;
2689 	}
2690 	satcu = NULL;
2691 out:
2692 	rcu_read_unlock();
2693 	return satcu;
2694 }
2695 
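/*
 * Decide whether ATS may be enabled for @dev.  A matching SATC entry, if
 * present, decides directly.  Root-complex integrated devices (no upstream
 * bridge) are allowed; otherwise the device's root port must be covered by
 * an ATSR (or an include-all ATSR must exist) for the device's PCI segment.
 */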
2696 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2697 {
2698 	struct pci_dev *bridge = NULL;
2699 	struct dmar_atsr_unit *atsru;
2700 	struct dmar_satc_unit *satcu;
2701 	struct acpi_dmar_atsr *atsr;
2702 	bool supported = true;
2703 	struct pci_bus *bus;
2704 	struct device *tmp;
2705 	int i;
2706 
2707 	dev = pci_physfn(dev);
2708 	satcu = dmar_find_matched_satc_unit(dev);
2709 	if (satcu)
2710 		/*
2711 		 * This device supports ATS as it is listed in the SATC table.
2712 		 * When the IOMMU is in legacy mode, the hardware enables ATS
2713 		 * automatically for devices that require it, so the OS should
2714 		 * not enable ATS on this device in order to avoid duplicated
2715 		 * TLB invalidations.
2716 		 */
2717 		return !(satcu->atc_required && !sm_supported(iommu));
2718 
2719 	for (bus = dev->bus; bus; bus = bus->parent) {
2720 		bridge = bus->self;
2721 		/* If it's an integrated device, allow ATS */
2722 		if (!bridge)
2723 			return true;
2724 		/* Connected via non-PCIe: no ATS */
2725 		if (!pci_is_pcie(bridge) ||
2726 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2727 			return false;
2728 		/* If we found the root port, look it up in the ATSR */
2729 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2730 			break;
2731 	}
2732 
2733 	rcu_read_lock();
2734 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2735 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2736 		if (atsr->segment != pci_domain_nr(dev->bus))
2737 			continue;
2738 
2739 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2740 			if (tmp == &bridge->dev)
2741 				goto out;
2742 
2743 		if (atsru->include_all)
2744 			goto out;
2745 	}
2746 	supported = false;
2747 out:
2748 	rcu_read_unlock();
2749 
2750 	return supported;
2751 }
2752 
2753 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2754 {
2755 	int ret;
2756 	struct dmar_rmrr_unit *rmrru;
2757 	struct dmar_atsr_unit *atsru;
2758 	struct dmar_satc_unit *satcu;
2759 	struct acpi_dmar_atsr *atsr;
2760 	struct acpi_dmar_reserved_memory *rmrr;
2761 	struct acpi_dmar_satc *satc;
2762 
2763 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2764 		return 0;
2765 
2766 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2767 		rmrr = container_of(rmrru->hdr,
2768 				    struct acpi_dmar_reserved_memory, header);
2769 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2770 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2771 				((void *)rmrr) + rmrr->header.length,
2772 				rmrr->segment, rmrru->devices,
2773 				rmrru->devices_cnt);
2774 			if (ret < 0)
2775 				return ret;
2776 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2777 			dmar_remove_dev_scope(info, rmrr->segment,
2778 				rmrru->devices, rmrru->devices_cnt);
2779 		}
2780 	}
2781 
2782 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2783 		if (atsru->include_all)
2784 			continue;
2785 
2786 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2787 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2788 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2789 					(void *)atsr + atsr->header.length,
2790 					atsr->segment, atsru->devices,
2791 					atsru->devices_cnt);
2792 			if (ret > 0)
2793 				break;
2794 			else if (ret < 0)
2795 				return ret;
2796 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2797 			if (dmar_remove_dev_scope(info, atsr->segment,
2798 					atsru->devices, atsru->devices_cnt))
2799 				break;
2800 		}
2801 	}
2802 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2803 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2804 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2805 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2806 					(void *)satc + satc->header.length,
2807 					satc->segment, satcu->devices,
2808 					satcu->devices_cnt);
2809 			if (ret > 0)
2810 				break;
2811 			else if (ret < 0)
2812 				return ret;
2813 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2814 			if (dmar_remove_dev_scope(info, satc->segment,
2815 					satcu->devices, satcu->devices_cnt))
2816 				break;
2817 		}
2818 	}
2819 
2820 	return 0;
2821 }
2822 
2823 static void intel_disable_iommus(void)
2824 {
2825 	struct intel_iommu *iommu = NULL;
2826 	struct dmar_drhd_unit *drhd;
2827 
2828 	for_each_iommu(iommu, drhd)
2829 		iommu_disable_translation(iommu);
2830 }
2831 
2832 void intel_iommu_shutdown(void)
2833 {
2834 	struct dmar_drhd_unit *drhd;
2835 	struct intel_iommu *iommu = NULL;
2836 
2837 	if (no_iommu || dmar_disabled)
2838 		return;
2839 
2840 	/*
2841 	 * All other CPUs were brought down, hotplug interrupts were disabled,
2842 	 * so no locking or RCU checking is needed anymore.
2843 	 */
2844 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2845 		iommu = drhd->iommu;
2846 
2847 		/* Disable PMRs explicitly here. */
2848 		iommu_disable_protect_mem_regions(iommu);
2849 
2850 		/* Make sure the IOMMUs are switched off */
2851 		iommu_disable_translation(iommu);
2852 	}
2853 }
2854 
2855 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2856 {
2857 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2858 
2859 	return container_of(iommu_dev, struct intel_iommu, iommu);
2860 }
2861 
2862 static ssize_t version_show(struct device *dev,
2863 			    struct device_attribute *attr, char *buf)
2864 {
2865 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2866 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2867 	return sysfs_emit(buf, "%d:%d\n",
2868 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2869 }
2870 static DEVICE_ATTR_RO(version);
2871 
2872 static ssize_t address_show(struct device *dev,
2873 			    struct device_attribute *attr, char *buf)
2874 {
2875 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2876 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2877 }
2878 static DEVICE_ATTR_RO(address);
2879 
2880 static ssize_t cap_show(struct device *dev,
2881 			struct device_attribute *attr, char *buf)
2882 {
2883 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2884 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2885 }
2886 static DEVICE_ATTR_RO(cap);
2887 
2888 static ssize_t ecap_show(struct device *dev,
2889 			 struct device_attribute *attr, char *buf)
2890 {
2891 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2892 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2893 }
2894 static DEVICE_ATTR_RO(ecap);
2895 
2896 static ssize_t domains_supported_show(struct device *dev,
2897 				      struct device_attribute *attr, char *buf)
2898 {
2899 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2900 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2901 }
2902 static DEVICE_ATTR_RO(domains_supported);
2903 
2904 static ssize_t domains_used_show(struct device *dev,
2905 				 struct device_attribute *attr, char *buf)
2906 {
2907 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2908 	unsigned int count = 0;
2909 	int id;
2910 
2911 	for (id = 0; id < cap_ndoms(iommu->cap); id++)
2912 		if (ida_exists(&iommu->domain_ida, id))
2913 			count++;
2914 
2915 	return sysfs_emit(buf, "%d\n", count);
2916 }
2917 static DEVICE_ATTR_RO(domains_used);
2918 
2919 static struct attribute *intel_iommu_attrs[] = {
2920 	&dev_attr_version.attr,
2921 	&dev_attr_address.attr,
2922 	&dev_attr_cap.attr,
2923 	&dev_attr_ecap.attr,
2924 	&dev_attr_domains_supported.attr,
2925 	&dev_attr_domains_used.attr,
2926 	NULL,
2927 };
2928 
2929 static struct attribute_group intel_iommu_group = {
2930 	.name = "intel-iommu",
2931 	.attrs = intel_iommu_attrs,
2932 };
2933 
2934 const struct attribute_group *intel_iommu_groups[] = {
2935 	&intel_iommu_group,
2936 	NULL,
2937 };
2938 
2939 static bool has_external_pci(void)
2940 {
2941 	struct pci_dev *pdev = NULL;
2942 
2943 	for_each_pci_dev(pdev)
2944 		if (pdev->external_facing) {
2945 			pci_dev_put(pdev);
2946 			return true;
2947 		}
2948 
2949 	return false;
2950 }
2951 
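/*
 * Honour the DMAR platform opt-in flag: if the firmware opted in and an
 * external-facing (potentially untrusted) PCI device is present, force the
 * IOMMU on even if it was disabled on the command line, defaulting to
 * passthrough for trusted devices when it had been disabled.
 */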
2952 static int __init platform_optin_force_iommu(void)
2953 {
2954 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2955 		return 0;
2956 
2957 	if (no_iommu || dmar_disabled)
2958 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2959 
2960 	/*
2961 	 * If Intel-IOMMU is disabled by default, we will apply identity
2962 	 * map for all devices except those marked as being untrusted.
2963 	 */
2964 	if (dmar_disabled)
2965 		iommu_set_default_passthrough(false);
2966 
2967 	dmar_disabled = 0;
2968 	no_iommu = 0;
2969 
2970 	return 1;
2971 }
2972 
2973 static int __init probe_acpi_namespace_devices(void)
2974 {
2975 	struct dmar_drhd_unit *drhd;
2976 	/* To avoid a -Wunused-but-set-variable warning. */
2977 	struct intel_iommu *iommu __maybe_unused;
2978 	struct device *dev;
2979 	int i, ret = 0;
2980 
2981 	for_each_active_iommu(iommu, drhd) {
2982 		for_each_active_dev_scope(drhd->devices,
2983 					  drhd->devices_cnt, i, dev) {
2984 			struct acpi_device_physical_node *pn;
2985 			struct acpi_device *adev;
2986 
2987 			if (dev->bus != &acpi_bus_type)
2988 				continue;
2989 
2990 			up_read(&dmar_global_lock);
2991 			adev = to_acpi_device(dev);
2992 			mutex_lock(&adev->physical_node_lock);
2993 			list_for_each_entry(pn,
2994 					    &adev->physical_node_list, node) {
2995 				ret = iommu_probe_device(pn->dev);
2996 				if (ret)
2997 					break;
2998 			}
2999 			mutex_unlock(&adev->physical_node_lock);
3000 			down_read(&dmar_global_lock);
3001 
3002 			if (ret)
3003 				return ret;
3004 		}
3005 	}
3006 
3007 	return 0;
3008 }
3009 
3010 static __init int tboot_force_iommu(void)
3011 {
3012 	if (!tboot_enabled())
3013 		return 0;
3014 
3015 	if (no_iommu || dmar_disabled)
3016 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3017 
3018 	dmar_disabled = 0;
3019 	no_iommu = 0;
3020 
3021 	return 1;
3022 }
3023 
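/*
 * Main entry point for VT-d initialization: parse the DMAR table and device
 * scopes, initialize every DMAR unit via init_dmars(), register the IOMMUs
 * with the core IOMMU layer and sysfs, and finally enable DMA remapping on
 * all units that are not ignored or already pre-enabled.
 */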
3024 int __init intel_iommu_init(void)
3025 {
3026 	int ret = -ENODEV;
3027 	struct dmar_drhd_unit *drhd;
3028 	struct intel_iommu *iommu;
3029 
3030 	/*
3031 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3032 	 * opt in, so enforce that.
3033 	 */
3034 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3035 		    platform_optin_force_iommu();
3036 
3037 	down_write(&dmar_global_lock);
3038 	if (dmar_table_init()) {
3039 		if (force_on)
3040 			panic("tboot: Failed to initialize DMAR table\n");
3041 		goto out_free_dmar;
3042 	}
3043 
3044 	if (dmar_dev_scope_init() < 0) {
3045 		if (force_on)
3046 			panic("tboot: Failed to initialize DMAR device scope\n");
3047 		goto out_free_dmar;
3048 	}
3049 
3050 	up_write(&dmar_global_lock);
3051 
3052 	/*
3053 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3054 	 * complain later when we register it under the lock.
3055 	 */
3056 	dmar_register_bus_notifier();
3057 
3058 	down_write(&dmar_global_lock);
3059 
3060 	if (!no_iommu)
3061 		intel_iommu_debugfs_init();
3062 
3063 	if (no_iommu || dmar_disabled) {
3064 		/*
3065 		 * We exit the function here to ensure IOMMU's remapping and
3066 		 * mempool aren't set up, which means that the IOMMU's PMRs
3067 		 * won't be disabled via the call to init_dmars(). So disable
3068 		 * them explicitly here. The PMRs were set up by tboot prior to
3069 		 * calling SENTER, but the kernel is expected to reset/tear
3070 		 * down the PMRs.
3071 		 */
3072 		if (intel_iommu_tboot_noforce) {
3073 			for_each_iommu(iommu, drhd)
3074 				iommu_disable_protect_mem_regions(iommu);
3075 		}
3076 
3077 		/*
3078 		 * Make sure the IOMMUs are switched off, even when we
3079 		 * boot into a kexec kernel and the previous kernel left
3080 		 * them enabled
3081 		 */
3082 		intel_disable_iommus();
3083 		goto out_free_dmar;
3084 	}
3085 
3086 	if (list_empty(&dmar_rmrr_units))
3087 		pr_info("No RMRR found\n");
3088 
3089 	if (list_empty(&dmar_atsr_units))
3090 		pr_info("No ATSR found\n");
3091 
3092 	if (list_empty(&dmar_satc_units))
3093 		pr_info("No SATC found\n");
3094 
3095 	init_no_remapping_devices();
3096 
3097 	ret = init_dmars();
3098 	if (ret) {
3099 		if (force_on)
3100 			panic("tboot: Failed to initialize DMARs\n");
3101 		pr_err("Initialization failed\n");
3102 		goto out_free_dmar;
3103 	}
3104 	up_write(&dmar_global_lock);
3105 
3106 	init_iommu_pm_ops();
3107 
3108 	down_read(&dmar_global_lock);
3109 	for_each_active_iommu(iommu, drhd) {
3110 		/*
3111 		 * The flush queue implementation does not perform
3112 		 * page-selective invalidations that are required for efficient
3113 		 * TLB flushes in virtual environments.  The benefit of batching
3114 		 * is likely to be much lower than the overhead of synchronizing
3115 		 * the virtual and physical IOMMU page-tables.
3116 		 */
3117 		if (cap_caching_mode(iommu->cap) &&
3118 		    !first_level_by_default(iommu)) {
3119 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3120 			iommu_set_dma_strict();
3121 		}
3122 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3123 				       intel_iommu_groups,
3124 				       "%s", iommu->name);
3125 		/*
3126 		 * The iommu device probe is protected by the iommu_probe_device_lock.
3127 		 * Release the dmar_global_lock before entering the device probe path
3128 		 * to avoid unnecessary lock order splat.
3129 		 */
3130 		up_read(&dmar_global_lock);
3131 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3132 		down_read(&dmar_global_lock);
3133 
3134 		iommu_pmu_register(iommu);
3135 	}
3136 
3137 	if (probe_acpi_namespace_devices())
3138 		pr_warn("ACPI name space devices didn't probe correctly\n");
3139 
3140 	/* Finally, we enable the DMA remapping hardware. */
3141 	for_each_iommu(iommu, drhd) {
3142 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3143 			iommu_enable_translation(iommu);
3144 
3145 		iommu_disable_protect_mem_regions(iommu);
3146 	}
3147 	up_read(&dmar_global_lock);
3148 
3149 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3150 
3151 	intel_iommu_enabled = 1;
3152 
3153 	return 0;
3154 
3155 out_free_dmar:
3156 	intel_iommu_free_dmars();
3157 	up_write(&dmar_global_lock);
3158 	return ret;
3159 }
3160 
3161 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3162 {
3163 	struct device_domain_info *info = opaque;
3164 
3165 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3166 	return 0;
3167 }
3168 
3169 /*
3170  * NB - intel-iommu lacks any sort of reference counting for the users of
3171  * dependent devices.  If multiple endpoints have intersecting dependent
3172  * devices, unbinding the driver from any one of them will possibly leave
3173  * the others unable to operate.
3174  */
3175 static void domain_context_clear(struct device_domain_info *info)
3176 {
3177 	if (!dev_is_pci(info->dev)) {
3178 		domain_context_clear_one(info, info->bus, info->devfn);
3179 		return;
3180 	}
3181 
3182 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3183 			       &domain_context_clear_one_cb, info);
3184 	iommu_disable_pci_ats(info);
3185 }
3186 
3187 /*
3188  * Clear the page table pointer in context or pasid table entries so that
3189  * all DMA requests without PASID from the device are blocked. If the page
3190  * table has been set, clean up the data structures.
3191  */
3192 void device_block_translation(struct device *dev)
3193 {
3194 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3195 	struct intel_iommu *iommu = info->iommu;
3196 	unsigned long flags;
3197 
3198 	/* Device is in DMA blocking state. Nothing to do. */
3199 	if (!info->domain_attached)
3200 		return;
3201 
3202 	if (info->domain)
3203 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3204 
3205 	if (!dev_is_real_dma_subdevice(dev)) {
3206 		if (sm_supported(iommu))
3207 			intel_pasid_tear_down_entry(iommu, dev,
3208 						    IOMMU_NO_PASID, false);
3209 		else
3210 			domain_context_clear(info);
3211 	}
3212 
3213 	/* Device now in DMA blocking state. */
3214 	info->domain_attached = false;
3215 
3216 	if (!info->domain)
3217 		return;
3218 
3219 	spin_lock_irqsave(&info->domain->lock, flags);
3220 	list_del(&info->link);
3221 	spin_unlock_irqrestore(&info->domain->lock, flags);
3222 
3223 	domain_detach_iommu(info->domain, iommu);
3224 	info->domain = NULL;
3225 }
3226 
3227 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3228 				      struct device *dev)
3229 {
3230 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3231 
3232 	iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
3233 	device_block_translation(dev);
3234 	return 0;
3235 }
3236 
3237 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3238 					 struct device *dev, ioasid_t pasid,
3239 					 struct iommu_domain *old);
3240 
3241 static struct iommu_domain blocking_domain = {
3242 	.type = IOMMU_DOMAIN_BLOCKED,
3243 	.ops = &(const struct iommu_domain_ops) {
3244 		.attach_dev	= blocking_domain_attach_dev,
3245 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3246 	}
3247 };
3248 
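/*
 * Return the number of supported superpage levels above 4K: for first-stage
 * page tables this is 2 (2M + 1G) when FL1GP is supported, otherwise 1 (2M);
 * for second-stage tables it is derived from the SLLPS capability field.
 * Returns 0 when superpages are disabled on the command line.
 */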
3249 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3250 {
3251 	if (!intel_iommu_superpage)
3252 		return 0;
3253 
3254 	if (first_stage)
3255 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3256 
3257 	return fls(cap_super_page_val(iommu->cap));
3258 }
3259 
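/*
 * Allocate and initialize a paging domain (first- or second-stage) sized to
 * the capabilities of @dev's IOMMU: address width, page-size bitmap,
 * coherency, IOVA aperture and the top-level page directory.
 */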
3260 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3261 {
3262 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3263 	struct intel_iommu *iommu = info->iommu;
3264 	struct dmar_domain *domain;
3265 	int addr_width;
3266 
3267 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3268 	if (!domain)
3269 		return ERR_PTR(-ENOMEM);
3270 
3271 	INIT_LIST_HEAD(&domain->devices);
3272 	INIT_LIST_HEAD(&domain->dev_pasids);
3273 	INIT_LIST_HEAD(&domain->cache_tags);
3274 	spin_lock_init(&domain->lock);
3275 	spin_lock_init(&domain->cache_lock);
3276 	xa_init(&domain->iommu_array);
3277 	INIT_LIST_HEAD(&domain->s1_domains);
3278 	spin_lock_init(&domain->s1_lock);
3279 
3280 	domain->nid = dev_to_node(dev);
3281 	domain->use_first_level = first_stage;
3282 
3283 	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
3284 
3285 	/* calculate the address width */
3286 	addr_width = agaw_to_width(iommu->agaw);
3287 	if (addr_width > cap_mgaw(iommu->cap))
3288 		addr_width = cap_mgaw(iommu->cap);
3289 	domain->gaw = addr_width;
3290 	domain->agaw = iommu->agaw;
3291 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3292 
3293 	/* iommu memory access coherency */
3294 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3295 
3296 	/* pagesize bitmap */
3297 	domain->domain.pgsize_bitmap = SZ_4K;
3298 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3299 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3300 
3301 	/*
3302 	 * IOVA aperture: First-level translation restricts the input-address
3303 	 * to a canonical address (i.e., address bits 63:N have the same value
3304 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3305 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3306 	 */
3307 	domain->domain.geometry.force_aperture = true;
3308 	domain->domain.geometry.aperture_start = 0;
3309 	if (first_stage)
3310 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3311 	else
3312 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3313 
3314 	/* always allocate the top pgd */
3315 	domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
3316 	if (!domain->pgd) {
3317 		kfree(domain);
3318 		return ERR_PTR(-ENOMEM);
3319 	}
3320 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3321 
3322 	return domain;
3323 }
3324 
3325 static struct iommu_domain *
3326 intel_iommu_domain_alloc_first_stage(struct device *dev,
3327 				     struct intel_iommu *iommu, u32 flags)
3328 {
3329 	struct dmar_domain *dmar_domain;
3330 
3331 	if (flags & ~IOMMU_HWPT_ALLOC_PASID)
3332 		return ERR_PTR(-EOPNOTSUPP);
3333 
3334 	/* Only SL is available in legacy mode */
3335 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3336 		return ERR_PTR(-EOPNOTSUPP);
3337 
3338 	dmar_domain = paging_domain_alloc(dev, true);
3339 	if (IS_ERR(dmar_domain))
3340 		return ERR_CAST(dmar_domain);
3341 
3342 	dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
3343 	/*
3344 	 * iotlb sync for map is only needed for legacy implementations that
3345 	 * explicitly require flushing internal write buffers to ensure memory
3346 	 * coherence.
3347 	 */
3348 	if (rwbf_required(iommu))
3349 		dmar_domain->iotlb_sync_map = true;
3350 
3351 	return &dmar_domain->domain;
3352 }
3353 
3354 static struct iommu_domain *
3355 intel_iommu_domain_alloc_second_stage(struct device *dev,
3356 				      struct intel_iommu *iommu, u32 flags)
3357 {
3358 	struct dmar_domain *dmar_domain;
3359 
3360 	if (flags &
3361 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3362 	       IOMMU_HWPT_ALLOC_PASID)))
3363 		return ERR_PTR(-EOPNOTSUPP);
3364 
3365 	if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
3366 	     !nested_supported(iommu)) ||
3367 	    ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
3368 	     !ssads_supported(iommu)))
3369 		return ERR_PTR(-EOPNOTSUPP);
3370 
3371 	/* Legacy mode always supports second stage */
3372 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3373 		return ERR_PTR(-EOPNOTSUPP);
3374 
3375 	dmar_domain = paging_domain_alloc(dev, false);
3376 	if (IS_ERR(dmar_domain))
3377 		return ERR_CAST(dmar_domain);
3378 
3379 	dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
3380 	dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3381 
3382 	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
3383 		dmar_domain->domain.dirty_ops = &intel_dirty_ops;
3384 
3385 	/*
3386 	 * Besides the internal write buffer flush, the caching mode used for
3387 	 * legacy nested translation (which utilizes shadowing page tables)
3388 	 * also requires iotlb sync on map.
3389 	 */
3390 	if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
3391 		dmar_domain->iotlb_sync_map = true;
3392 
3393 	return &dmar_domain->domain;
3394 }
3395 
3396 static struct iommu_domain *
3397 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3398 				      const struct iommu_user_data *user_data)
3399 {
3400 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3401 	struct intel_iommu *iommu = info->iommu;
3402 	struct iommu_domain *domain;
3403 
3404 	if (user_data)
3405 		return ERR_PTR(-EOPNOTSUPP);
3406 
3407 	/* Prefer first stage if possible by default. */
3408 	domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
3409 	if (domain != ERR_PTR(-EOPNOTSUPP))
3410 		return domain;
3411 	return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
3412 }
3413 
3414 static void intel_iommu_domain_free(struct iommu_domain *domain)
3415 {
3416 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3417 
3418 	if (WARN_ON(dmar_domain->nested_parent &&
3419 		    !list_empty(&dmar_domain->s1_domains)))
3420 		return;
3421 
3422 	if (WARN_ON(!list_empty(&dmar_domain->devices)))
3423 		return;
3424 
3425 	if (dmar_domain->pgd) {
3426 		struct iommu_pages_list freelist =
3427 			IOMMU_PAGES_LIST_INIT(freelist);
3428 
3429 		domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
3430 			     &freelist);
3431 		iommu_put_pages_list(&freelist);
3432 	}
3433 
3434 	kfree(dmar_domain->qi_batch);
3435 	kfree(dmar_domain);
3436 }
3437 
3438 static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
3439 						struct intel_iommu *iommu)
3440 {
3441 	if (WARN_ON(dmar_domain->domain.dirty_ops ||
3442 		    dmar_domain->nested_parent))
3443 		return -EINVAL;
3444 
3445 	/* Only SL is available in legacy mode */
3446 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3447 		return -EINVAL;
3448 
3449 	/* Same page size support */
3450 	if (!cap_fl1gp_support(iommu->cap) &&
3451 	    (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3452 		return -EINVAL;
3453 
3454 	/* iotlb sync on map requirement */
3455 	if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
3456 		return -EINVAL;
3457 
3458 	return 0;
3459 }
3460 
3461 static int
3462 paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
3463 				      struct intel_iommu *iommu)
3464 {
3465 	unsigned int sslps = cap_super_page_val(iommu->cap);
3466 
3467 	if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
3468 		return -EINVAL;
3469 	if (dmar_domain->nested_parent && !nested_supported(iommu))
3470 		return -EINVAL;
3471 
3472 	/* Legacy mode always supports second stage */
3473 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3474 		return -EINVAL;
3475 
3476 	/* Same page size support */
3477 	if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
3478 		return -EINVAL;
3479 	if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3480 		return -EINVAL;
3481 
3482 	/* iotlb sync on map requirement */
3483 	if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
3484 	    !dmar_domain->iotlb_sync_map)
3485 		return -EINVAL;
3486 
3487 	return 0;
3488 }
3489 
3490 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3491 {
3492 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3493 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3494 	struct intel_iommu *iommu = info->iommu;
3495 	int ret = -EINVAL;
3496 	int addr_width;
3497 
3498 	if (intel_domain_is_fs_paging(dmar_domain))
3499 		ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
3500 	else if (intel_domain_is_ss_paging(dmar_domain))
3501 		ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
3502 	else if (WARN_ON(true))
3503 		ret = -EINVAL;
3504 	if (ret)
3505 		return ret;
3506 
3507 	/*
3508 	 * FIXME this is locked wrong, it needs to be under the
3509 	 * dmar_domain->lock
3510 	 */
3511 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3512 		return -EINVAL;
3513 
3514 	if (dmar_domain->iommu_coherency !=
3515 			iommu_paging_structure_coherency(iommu))
3516 		return -EINVAL;
3517 
3518 
3519 	/* check if this iommu agaw is sufficient for max mapped address */
3520 	addr_width = agaw_to_width(iommu->agaw);
3521 	if (addr_width > cap_mgaw(iommu->cap))
3522 		addr_width = cap_mgaw(iommu->cap);
3523 
3524 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3525 		return -EINVAL;
3526 
3527 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3528 	    context_copied(iommu, info->bus, info->devfn))
3529 		return intel_pasid_setup_sm_context(dev);
3530 
3531 	return 0;
3532 }
3533 
3534 static int intel_iommu_attach_device(struct iommu_domain *domain,
3535 				     struct device *dev)
3536 {
3537 	int ret;
3538 
3539 	device_block_translation(dev);
3540 
3541 	ret = paging_domain_compatible(domain, dev);
3542 	if (ret)
3543 		return ret;
3544 
3545 	ret = iopf_for_domain_set(domain, dev);
3546 	if (ret)
3547 		return ret;
3548 
3549 	ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
3550 	if (ret)
3551 		iopf_for_domain_remove(domain, dev);
3552 
3553 	return ret;
3554 }
3555 
3556 static int intel_iommu_map(struct iommu_domain *domain,
3557 			   unsigned long iova, phys_addr_t hpa,
3558 			   size_t size, int iommu_prot, gfp_t gfp)
3559 {
3560 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3561 	u64 max_addr;
3562 	int prot = 0;
3563 
3564 	if (iommu_prot & IOMMU_READ)
3565 		prot |= DMA_PTE_READ;
3566 	if (iommu_prot & IOMMU_WRITE)
3567 		prot |= DMA_PTE_WRITE;
3568 	if (dmar_domain->set_pte_snp)
3569 		prot |= DMA_PTE_SNP;
3570 
3571 	max_addr = iova + size;
3572 	if (dmar_domain->max_addr < max_addr) {
3573 		u64 end;
3574 
3575 		/* check if minimum agaw is sufficient for mapped address */
3576 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3577 		if (end < max_addr) {
3578 			pr_err("%s: iommu width (%d) is not "
3579 			       "sufficient for the mapped address (%llx)\n",
3580 			       __func__, dmar_domain->gaw, max_addr);
3581 			return -EFAULT;
3582 		}
3583 		dmar_domain->max_addr = max_addr;
3584 	}
3585 	/* Round up size to next multiple of PAGE_SIZE, if it and
3586 	   the low bits of hpa would take us onto the next page */
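	/*
	 * Worked example (illustrative, assuming 4KiB VT-d pages): hpa =
	 * 0x1800 with size = 0x1000 straddles a page boundary, so even
	 * though size is only one page long, the mapping below must cover
	 * two page frames, i.e. aligned_nrpages(0x1800, 0x1000) == 2.
	 */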
3587 	size = aligned_nrpages(hpa, size);
3588 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3589 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3590 }
3591 
3592 static int intel_iommu_map_pages(struct iommu_domain *domain,
3593 				 unsigned long iova, phys_addr_t paddr,
3594 				 size_t pgsize, size_t pgcount,
3595 				 int prot, gfp_t gfp, size_t *mapped)
3596 {
3597 	unsigned long pgshift = __ffs(pgsize);
3598 	size_t size = pgcount << pgshift;
3599 	int ret;
3600 
3601 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3602 		return -EINVAL;
3603 
3604 	if (!IS_ALIGNED(iova | paddr, pgsize))
3605 		return -EINVAL;
3606 
3607 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3608 	if (!ret && mapped)
3609 		*mapped = size;
3610 
3611 	return ret;
3612 }
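/*
 * Illustrative sketch (not part of this driver): callers never invoke
 * intel_iommu_map_pages() directly; mappings arrive through the generic
 * IOMMU API, which splits a request into the page sizes advertised in the
 * domain's pgsize_bitmap. The helper below is hypothetical.
 */
static int __maybe_unused example_map_one_page(struct iommu_domain *domain,
					       unsigned long iova,
					       phys_addr_t paddr)
{
	/* IOMMU_READ/IOMMU_WRITE are translated to DMA_PTE_* in intel_iommu_map() */
	return iommu_map(domain, iova, paddr, SZ_4K,
			 IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
}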
3613 
3614 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3615 				unsigned long iova, size_t size,
3616 				struct iommu_iotlb_gather *gather)
3617 {
3618 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3619 	unsigned long start_pfn, last_pfn;
3620 	int level = 0;
3621 
3622 	/* Cope with horrid API which requires us to unmap more than the
3623 	   size argument if it happens to be a large-page mapping. */
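	/*
	 * For example (illustrative): if iova falls inside a 2MiB superpage
	 * mapping, the level found below selects that superpage and size is
	 * bumped to 2MiB, so the whole superpage is unmapped and the larger
	 * size is reported back to the caller.
	 */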
3624 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3625 				     &level, GFP_ATOMIC)))
3626 		return 0;
3627 
3628 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3629 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3630 
3631 	start_pfn = iova >> VTD_PAGE_SHIFT;
3632 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3633 
3634 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3635 
3636 	if (dmar_domain->max_addr == iova + size)
3637 		dmar_domain->max_addr = iova;
3638 
3639 	/*
3640 	 * We do not use page-selective IOTLB invalidation in flush queue,
3641 	 * so there is no need to track page and sync iotlb.
3642 	 */
3643 	if (!iommu_iotlb_gather_queued(gather))
3644 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3645 
3646 	return size;
3647 }
3648 
3649 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3650 				      unsigned long iova,
3651 				      size_t pgsize, size_t pgcount,
3652 				      struct iommu_iotlb_gather *gather)
3653 {
3654 	unsigned long pgshift = __ffs(pgsize);
3655 	size_t size = pgcount << pgshift;
3656 
3657 	return intel_iommu_unmap(domain, iova, size, gather);
3658 }
3659 
3660 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3661 				 struct iommu_iotlb_gather *gather)
3662 {
3663 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3664 			      gather->end,
3665 			      iommu_pages_list_empty(&gather->freelist));
3666 	iommu_put_pages_list(&gather->freelist);
3667 }
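/*
 * Illustrative sketch (not part of this driver): unmap and TLB sync are
 * driven from the core. iommu_unmap() batches the freed page-table pages in
 * an iotlb_gather and ends with a sync that flushes the range and releases
 * the pages collected above.
 */
static size_t __maybe_unused example_unmap_range(struct iommu_domain *domain,
						 unsigned long iova,
						 size_t size)
{
	/* resolves to intel_iommu_unmap_pages() + intel_iommu_tlb_sync() */
	return iommu_unmap(domain, iova, size);
}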
3668 
3669 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3670 					    dma_addr_t iova)
3671 {
3672 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3673 	struct dma_pte *pte;
3674 	int level = 0;
3675 	u64 phys = 0;
3676 
3677 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3678 			     GFP_ATOMIC);
3679 	if (pte && dma_pte_present(pte))
3680 		phys = dma_pte_addr(pte) +
3681 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3682 						VTD_PAGE_SHIFT) - 1));
3683 
3684 	return phys;
3685 }
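/*
 * Illustrative sketch (not part of this driver): the lookup above backs the
 * generic iommu_iova_to_phys() helper, so a caller can sanity-check a
 * translation it has just created. "example_mapping_matches" is hypothetical.
 */
static bool __maybe_unused example_mapping_matches(struct iommu_domain *domain,
						   unsigned long iova,
						   phys_addr_t expected)
{
	return iommu_iova_to_phys(domain, iova) == expected;
}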
3686 
3687 static bool domain_support_force_snooping(struct dmar_domain *domain)
3688 {
3689 	struct device_domain_info *info;
3690 	bool support = true;
3691 
3692 	assert_spin_locked(&domain->lock);
3693 	list_for_each_entry(info, &domain->devices, link) {
3694 		if (!ecap_sc_support(info->iommu->ecap)) {
3695 			support = false;
3696 			break;
3697 		}
3698 	}
3699 
3700 	return support;
3701 }
3702 
3703 static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
3704 {
3705 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3706 	struct device_domain_info *info;
3707 
3708 	guard(spinlock_irqsave)(&dmar_domain->lock);
3709 
3710 	if (dmar_domain->force_snooping)
3711 		return true;
3712 
3713 	if (!domain_support_force_snooping(dmar_domain))
3714 		return false;
3715 
3716 	dmar_domain->force_snooping = true;
3717 	list_for_each_entry(info, &dmar_domain->devices, link)
3718 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3719 						     IOMMU_NO_PASID);
3720 	return true;
3721 }
3722 
3723 static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
3724 {
3725 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3726 
3727 	guard(spinlock_irqsave)(&dmar_domain->lock);
3728 	if (!domain_support_force_snooping(dmar_domain) ||
3729 	    dmar_domain->has_mappings)
3730 		return false;
3731 
3732 	/*
3733 	 * Second level page table supports per-PTE snoop control. The
3734 	 * iommu_map() interface will handle this by setting SNP bit.
3735 	 */
3736 	dmar_domain->set_pte_snp = true;
3737 	dmar_domain->force_snooping = true;
3738 	return true;
3739 }
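/*
 * Illustrative sketch (not part of this driver): callers such as VFIO and
 * iommufd request enforced snooping through the generic wrapper. For
 * second-stage domains this must happen before any mapping is installed,
 * because the SNP bit is applied per PTE at map time (set_pte_snp above).
 */
static bool __maybe_unused example_enforce_coherency(struct iommu_domain *domain)
{
	return iommu_enforce_cache_coherency(domain);
}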
3740 
3741 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3742 {
3743 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3744 
3745 	switch (cap) {
3746 	case IOMMU_CAP_CACHE_COHERENCY:
3747 	case IOMMU_CAP_DEFERRED_FLUSH:
3748 		return true;
3749 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3750 		return dmar_platform_optin();
3751 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3752 		return ecap_sc_support(info->iommu->ecap);
3753 	case IOMMU_CAP_DIRTY_TRACKING:
3754 		return ssads_supported(info->iommu);
3755 	default:
3756 		return false;
3757 	}
3758 }
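/*
 * Illustrative sketch (not part of this driver): consumers query these
 * capabilities through the generic helper rather than calling the op
 * directly, e.g. to decide whether dirty tracking can be used.
 */
static bool __maybe_unused example_can_track_dirty(struct device *dev)
{
	return device_iommu_capable(dev, IOMMU_CAP_DIRTY_TRACKING);
}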
3759 
3760 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3761 {
3762 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3763 	struct device_domain_info *info;
3764 	struct intel_iommu *iommu;
3765 	u8 bus, devfn;
3766 	int ret;
3767 
3768 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3769 	if (!iommu || !iommu->iommu.ops)
3770 		return ERR_PTR(-ENODEV);
3771 
3772 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3773 	if (!info)
3774 		return ERR_PTR(-ENOMEM);
3775 
3776 	if (dev_is_real_dma_subdevice(dev)) {
3777 		info->bus = pdev->bus->number;
3778 		info->devfn = pdev->devfn;
3779 		info->segment = pci_domain_nr(pdev->bus);
3780 	} else {
3781 		info->bus = bus;
3782 		info->devfn = devfn;
3783 		info->segment = iommu->segment;
3784 	}
3785 
3786 	info->dev = dev;
3787 	info->iommu = iommu;
3788 	if (dev_is_pci(dev)) {
3789 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3790 		    pci_ats_supported(pdev) &&
3791 		    dmar_ats_supported(pdev, iommu)) {
3792 			info->ats_supported = 1;
3793 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3794 
3795 			/*
3796 			 * For an IOMMU that supports device IOTLB throttling
3797 			 * (DIT), we assign the PFSID to the invalidation desc
3798 			 * of a VF such that IOMMU HW can gauge queue depth
3799 			 * at the PF level. If DIT is not set, the PFSID field
3800 			 * is treated as reserved and must be set to 0.
3801 			 */
3802 			if (ecap_dit(iommu->ecap))
3803 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3804 			info->ats_qdep = pci_ats_queue_depth(pdev);
3805 		}
3806 		if (sm_supported(iommu)) {
3807 			if (pasid_supported(iommu)) {
3808 				int features = pci_pasid_features(pdev);
3809 
3810 				if (features >= 0)
3811 					info->pasid_supported = features | 1;
3812 			}
3813 
3814 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3815 			    pci_pri_supported(pdev))
3816 				info->pri_supported = 1;
3817 		}
3818 	}
3819 
3820 	dev_iommu_priv_set(dev, info);
3821 	if (pdev && pci_ats_supported(pdev)) {
3822 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3823 		ret = device_rbtree_insert(iommu, info);
3824 		if (ret)
3825 			goto free;
3826 	}
3827 
3828 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3829 		ret = intel_pasid_alloc_table(dev);
3830 		if (ret) {
3831 			dev_err(dev, "PASID table allocation failed\n");
3832 			goto clear_rbtree;
3833 		}
3834 
3835 		if (!context_copied(iommu, info->bus, info->devfn)) {
3836 			ret = intel_pasid_setup_sm_context(dev);
3837 			if (ret)
3838 				goto free_table;
3839 		}
3840 	}
3841 
3842 	intel_iommu_debugfs_create_dev(info);
3843 
3844 	return &iommu->iommu;
3845 free_table:
3846 	intel_pasid_free_table(dev);
3847 clear_rbtree:
3848 	device_rbtree_remove(info);
3849 free:
3850 	kfree(info);
3851 
3852 	return ERR_PTR(ret);
3853 }
3854 
3855 static void intel_iommu_probe_finalize(struct device *dev)
3856 {
3857 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3858 	struct intel_iommu *iommu = info->iommu;
3859 
3860 	/*
3861 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3862 	 * device is undefined if you enable PASID support after ATS support.
3863 	 * So always enable PASID support on devices which have it, even if
3864 	 * we can't yet know if we're ever going to use it.
3865 	 */
3866 	if (info->pasid_supported &&
3867 	    !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3868 		info->pasid_enabled = 1;
3869 
3870 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3871 		iommu_enable_pci_ats(info);
3872 		/* Assign a DEVTLB cache tag to the default domain. */
3873 		if (info->ats_enabled && info->domain) {
3874 			u16 did = domain_id_iommu(info->domain, iommu);
3875 
3876 			if (cache_tag_assign(info->domain, did, dev,
3877 					     IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
3878 				iommu_disable_pci_ats(info);
3879 		}
3880 	}
3881 	iommu_enable_pci_pri(info);
3882 }
3883 
3884 static void intel_iommu_release_device(struct device *dev)
3885 {
3886 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3887 	struct intel_iommu *iommu = info->iommu;
3888 
3889 	iommu_disable_pci_pri(info);
3890 	iommu_disable_pci_ats(info);
3891 
3892 	if (info->pasid_enabled) {
3893 		pci_disable_pasid(to_pci_dev(dev));
3894 		info->pasid_enabled = 0;
3895 	}
3896 
3897 	mutex_lock(&iommu->iopf_lock);
3898 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3899 		device_rbtree_remove(info);
3900 	mutex_unlock(&iommu->iopf_lock);
3901 
3902 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3903 	    !context_copied(iommu, info->bus, info->devfn))
3904 		intel_pasid_teardown_sm_context(dev);
3905 
3906 	intel_pasid_free_table(dev);
3907 	intel_iommu_debugfs_remove_dev(info);
3908 	kfree(info);
3909 }
3910 
3911 static void intel_iommu_get_resv_regions(struct device *device,
3912 					 struct list_head *head)
3913 {
3914 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3915 	struct iommu_resv_region *reg;
3916 	struct dmar_rmrr_unit *rmrr;
3917 	struct device *i_dev;
3918 	int i;
3919 
3920 	rcu_read_lock();
3921 	for_each_rmrr_units(rmrr) {
3922 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3923 					  i, i_dev) {
3924 			struct iommu_resv_region *resv;
3925 			enum iommu_resv_type type;
3926 			size_t length;
3927 
3928 			if (i_dev != device &&
3929 			    !is_downstream_to_pci_bridge(device, i_dev))
3930 				continue;
3931 
3932 			length = rmrr->end_address - rmrr->base_address + 1;
3933 
3934 			type = device_rmrr_is_relaxable(device) ?
3935 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3936 
3937 			resv = iommu_alloc_resv_region(rmrr->base_address,
3938 						       length, prot, type,
3939 						       GFP_ATOMIC);
3940 			if (!resv)
3941 				break;
3942 
3943 			list_add_tail(&resv->list, head);
3944 		}
3945 	}
3946 	rcu_read_unlock();
3947 
3948 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3949 	if (dev_is_pci(device)) {
3950 		struct pci_dev *pdev = to_pci_dev(device);
3951 
3952 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3953 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3954 					IOMMU_RESV_DIRECT_RELAXABLE,
3955 					GFP_KERNEL);
3956 			if (reg)
3957 				list_add_tail(&reg->list, head);
3958 		}
3959 	}
3960 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3961 
3962 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3963 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3964 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3965 	if (!reg)
3966 		return;
3967 	list_add_tail(&reg->list, head);
3968 }
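/*
 * Illustrative sketch (not part of this driver): the reserved regions
 * reported above (RMRRs, the ISA-range workaround and the IOAPIC MSI window)
 * are consumed through the core API. "example_print_resv_regions" is
 * hypothetical.
 */
static void __maybe_unused example_print_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list)
		dev_info(dev, "reserved region %pa, length %zu, type %d\n",
			 &region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv_regions);
}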
3969 
3970 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3971 {
3972 	if (dev_is_pci(dev))
3973 		return pci_device_group(dev);
3974 	return generic_device_group(dev);
3975 }
3976 
3977 int intel_iommu_enable_iopf(struct device *dev)
3978 {
3979 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3980 	struct intel_iommu *iommu = info->iommu;
3981 	int ret;
3982 
3983 	if (!info->pri_enabled)
3984 		return -ENODEV;
3985 
3986 	/* pri_enabled is protected by the group mutex. */
3987 	iommu_group_mutex_assert(dev);
3988 	if (info->iopf_refcount) {
3989 		info->iopf_refcount++;
3990 		return 0;
3991 	}
3992 
3993 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3994 	if (ret)
3995 		return ret;
3996 
3997 	info->iopf_refcount = 1;
3998 
3999 	return 0;
4000 }
4001 
4002 void intel_iommu_disable_iopf(struct device *dev)
4003 {
4004 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4005 	struct intel_iommu *iommu = info->iommu;
4006 
4007 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
4008 		return;
4009 
4010 	iommu_group_mutex_assert(dev);
4011 	if (--info->iopf_refcount)
4012 		return;
4013 
4014 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4015 }
4016 
4017 static bool intel_iommu_is_attach_deferred(struct device *dev)
4018 {
4019 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4020 
4021 	return translation_pre_enabled(info->iommu) && !info->domain;
4022 }
4023 
4024 /*
4025  * Check that the device does not live on an external facing PCI port that is
4026  * marked as untrusted. Such devices should not be able to apply quirks and
4027  * thus not be able to bypass the IOMMU restrictions.
4028  */
4029 static bool risky_device(struct pci_dev *pdev)
4030 {
4031 	if (pdev->untrusted) {
4032 		pci_info(pdev,
4033 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4034 			 pdev->vendor, pdev->device);
4035 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4036 		return true;
4037 	}
4038 	return false;
4039 }
4040 
4041 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4042 				      unsigned long iova, size_t size)
4043 {
4044 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4045 
4046 	if (dmar_domain->iotlb_sync_map)
4047 		cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
4048 
4049 	return 0;
4050 }
4051 
4052 void domain_remove_dev_pasid(struct iommu_domain *domain,
4053 			     struct device *dev, ioasid_t pasid)
4054 {
4055 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4056 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4057 	struct intel_iommu *iommu = info->iommu;
4058 	struct dmar_domain *dmar_domain;
4059 	unsigned long flags;
4060 
4061 	if (!domain)
4062 		return;
4063 
4064 	/* Identity domain has no meta data for pasid. */
4065 	/* Identity domain has no metadata for the PASID. */
4066 		return;
4067 
4068 	dmar_domain = to_dmar_domain(domain);
4069 	spin_lock_irqsave(&dmar_domain->lock, flags);
4070 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4071 		if (curr->dev == dev && curr->pasid == pasid) {
4072 			list_del(&curr->link_domain);
4073 			dev_pasid = curr;
4074 			break;
4075 		}
4076 	}
4077 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4078 
4079 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4080 	domain_detach_iommu(dmar_domain, iommu);
4081 	if (!WARN_ON_ONCE(!dev_pasid)) {
4082 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4083 		kfree(dev_pasid);
4084 	}
4085 }
4086 
4087 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4088 					 struct device *dev, ioasid_t pasid,
4089 					 struct iommu_domain *old)
4090 {
4091 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4092 
4093 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4094 	iopf_for_domain_remove(old, dev);
4095 	domain_remove_dev_pasid(old, dev, pasid);
4096 
4097 	return 0;
4098 }
4099 
4100 struct dev_pasid_info *
4101 domain_add_dev_pasid(struct iommu_domain *domain,
4102 		     struct device *dev, ioasid_t pasid)
4103 {
4104 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4105 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4106 	struct intel_iommu *iommu = info->iommu;
4107 	struct dev_pasid_info *dev_pasid;
4108 	unsigned long flags;
4109 	int ret;
4110 
4111 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4112 	if (!dev_pasid)
4113 		return ERR_PTR(-ENOMEM);
4114 
4115 	ret = domain_attach_iommu(dmar_domain, iommu);
4116 	if (ret)
4117 		goto out_free;
4118 
4119 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4120 	if (ret)
4121 		goto out_detach_iommu;
4122 
4123 	dev_pasid->dev = dev;
4124 	dev_pasid->pasid = pasid;
4125 	spin_lock_irqsave(&dmar_domain->lock, flags);
4126 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4127 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4128 
4129 	return dev_pasid;
4130 out_detach_iommu:
4131 	domain_detach_iommu(dmar_domain, iommu);
4132 out_free:
4133 	kfree(dev_pasid);
4134 	return ERR_PTR(ret);
4135 }
4136 
4137 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4138 				     struct device *dev, ioasid_t pasid,
4139 				     struct iommu_domain *old)
4140 {
4141 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4142 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4143 	struct intel_iommu *iommu = info->iommu;
4144 	struct dev_pasid_info *dev_pasid;
4145 	int ret;
4146 
4147 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4148 		return -EINVAL;
4149 
4150 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4151 		return -EOPNOTSUPP;
4152 
4153 	if (domain->dirty_ops)
4154 		return -EINVAL;
4155 
4156 	if (context_copied(iommu, info->bus, info->devfn))
4157 		return -EBUSY;
4158 
4159 	ret = paging_domain_compatible(domain, dev);
4160 	if (ret)
4161 		return ret;
4162 
4163 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4164 	if (IS_ERR(dev_pasid))
4165 		return PTR_ERR(dev_pasid);
4166 
4167 	ret = iopf_for_domain_replace(domain, old, dev);
4168 	if (ret)
4169 		goto out_remove_dev_pasid;
4170 
4171 	if (intel_domain_is_fs_paging(dmar_domain))
4172 		ret = domain_setup_first_level(iommu, dmar_domain,
4173 					       dev, pasid, old);
4174 	else if (intel_domain_is_ss_paging(dmar_domain))
4175 		ret = domain_setup_second_level(iommu, dmar_domain,
4176 						dev, pasid, old);
4177 	else if (WARN_ON(true))
4178 		ret = -EINVAL;
4179 
4180 	if (ret)
4181 		goto out_unwind_iopf;
4182 
4183 	domain_remove_dev_pasid(old, dev, pasid);
4184 
4185 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4186 
4187 	return 0;
4188 
4189 out_unwind_iopf:
4190 	iopf_for_domain_replace(old, domain, dev);
4191 out_remove_dev_pasid:
4192 	domain_remove_dev_pasid(domain, dev, pasid);
4193 	return ret;
4194 }
4195 
4196 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4197 {
4198 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4199 	struct intel_iommu *iommu = info->iommu;
4200 	struct iommu_hw_info_vtd *vtd;
4201 
4202 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4203 	if (!vtd)
4204 		return ERR_PTR(-ENOMEM);
4205 
4206 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4207 	vtd->cap_reg = iommu->cap;
4208 	vtd->ecap_reg = iommu->ecap;
4209 	*length = sizeof(*vtd);
4210 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4211 	return vtd;
4212 }
4213 
4214 /*
4215  * Set dirty tracking for the device list of a domain. The caller must
4216  * hold the domain->lock when calling it.
4217  */
4218 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4219 {
4220 	struct device_domain_info *info;
4221 	int ret = 0;
4222 
4223 	list_for_each_entry(info, devices, link) {
4224 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4225 						       IOMMU_NO_PASID, enable);
4226 		if (ret)
4227 			break;
4228 	}
4229 
4230 	return ret;
4231 }
4232 
4233 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4234 					    bool enable)
4235 {
4236 	struct dmar_domain *s1_domain;
4237 	unsigned long flags;
4238 	int ret;
4239 
4240 	spin_lock(&domain->s1_lock);
4241 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4242 		spin_lock_irqsave(&s1_domain->lock, flags);
4243 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4244 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4245 		if (ret)
4246 			goto err_unwind;
4247 	}
4248 	spin_unlock(&domain->s1_lock);
4249 	return 0;
4250 
4251 err_unwind:
4252 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4253 		spin_lock_irqsave(&s1_domain->lock, flags);
4254 		device_set_dirty_tracking(&s1_domain->devices,
4255 					  domain->dirty_tracking);
4256 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4257 	}
4258 	spin_unlock(&domain->s1_lock);
4259 	return ret;
4260 }
4261 
4262 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4263 					  bool enable)
4264 {
4265 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4266 	int ret;
4267 
4268 	spin_lock(&dmar_domain->lock);
4269 	if (dmar_domain->dirty_tracking == enable)
4270 		goto out_unlock;
4271 
4272 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4273 	if (ret)
4274 		goto err_unwind;
4275 
4276 	if (dmar_domain->nested_parent) {
4277 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4278 		if (ret)
4279 			goto err_unwind;
4280 	}
4281 
4282 	dmar_domain->dirty_tracking = enable;
4283 out_unlock:
4284 	spin_unlock(&dmar_domain->lock);
4285 
4286 	return 0;
4287 
4288 err_unwind:
4289 	device_set_dirty_tracking(&dmar_domain->devices,
4290 				  dmar_domain->dirty_tracking);
4291 	spin_unlock(&dmar_domain->lock);
4292 	return ret;
4293 }
4294 
4295 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4296 					    unsigned long iova, size_t size,
4297 					    unsigned long flags,
4298 					    struct iommu_dirty_bitmap *dirty)
4299 {
4300 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4301 	unsigned long end = iova + size - 1;
4302 	unsigned long pgsize;
4303 
4304 	/*
4305 	 * The IOMMUFD core calls into a domain with dirty tracking disabled,
4306 	 * and without an IOVA bitmap set, in order to clear dirty bits that
4307 	 * may have been left in the PTEs when dirty tracking was stopped.
4308 	 * This ensures that we never inherit dirtied bits from a previous cycle.
4309 	 */
4310 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4311 		return -EINVAL;
4312 
4313 	do {
4314 		struct dma_pte *pte;
4315 		int lvl = 0;
4316 
4317 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4318 				     GFP_ATOMIC);
4319 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4320 		if (!pte || !dma_pte_present(pte)) {
4321 			iova += pgsize;
4322 			continue;
4323 		}
4324 
4325 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4326 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4327 		iova += pgsize;
4328 	} while (iova < end);
4329 
4330 	return 0;
4331 }
4332 
4333 static const struct iommu_dirty_ops intel_dirty_ops = {
4334 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4335 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4336 };
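/*
 * Note (illustrative): these ops are only installed on second-stage paging
 * domains allocated with IOMMU_HWPT_ALLOC_DIRTY_TRACKING (see
 * intel_iommu_domain_alloc_second_stage() above); domains without
 * domain->dirty_ops never reach the two callbacks.
 */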
4337 
4338 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4339 {
4340 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4341 	struct intel_iommu *iommu = info->iommu;
4342 	struct context_entry *context;
4343 
4344 	spin_lock(&iommu->lock);
4345 	context = iommu_context_addr(iommu, bus, devfn, 1);
4346 	if (!context) {
4347 		spin_unlock(&iommu->lock);
4348 		return -ENOMEM;
4349 	}
4350 
4351 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4352 		spin_unlock(&iommu->lock);
4353 		return 0;
4354 	}
4355 
4356 	copied_context_tear_down(iommu, context, bus, devfn);
4357 	context_clear_entry(context);
4358 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4359 
4360 	/*
4361 	 * In pass through mode, AW must be programmed to indicate the largest
4362 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4363 	 */
4364 	context_set_address_width(context, iommu->msagaw);
4365 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4366 	context_set_fault_enable(context);
4367 	context_set_present(context);
4368 	if (!ecap_coherent(iommu->ecap))
4369 		clflush_cache_range(context, sizeof(*context));
4370 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4371 	spin_unlock(&iommu->lock);
4372 
4373 	return 0;
4374 }
4375 
4376 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4377 {
4378 	struct device *dev = data;
4379 
4380 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4381 }
4382 
4383 static int device_setup_pass_through(struct device *dev)
4384 {
4385 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4386 
4387 	if (!dev_is_pci(dev))
4388 		return context_setup_pass_through(dev, info->bus, info->devfn);
4389 
4390 	return pci_for_each_dma_alias(to_pci_dev(dev),
4391 				      context_setup_pass_through_cb, dev);
4392 }
4393 
4394 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4395 {
4396 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4397 	struct intel_iommu *iommu = info->iommu;
4398 	int ret;
4399 
4400 	device_block_translation(dev);
4401 
4402 	if (dev_is_real_dma_subdevice(dev))
4403 		return 0;
4404 
4405 	/*
4406 	 * No PRI support with the global identity domain. No need to enable or
4407 	 * disable PRI in this path as the iommu has been put in the blocking
4408 	 * state.
4409 	 */
4410 	if (sm_supported(iommu))
4411 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4412 	else
4413 		ret = device_setup_pass_through(dev);
4414 
4415 	if (!ret)
4416 		info->domain_attached = true;
4417 
4418 	return ret;
4419 }
4420 
4421 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4422 					 struct device *dev, ioasid_t pasid,
4423 					 struct iommu_domain *old)
4424 {
4425 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4426 	struct intel_iommu *iommu = info->iommu;
4427 	int ret;
4428 
4429 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4430 		return -EOPNOTSUPP;
4431 
4432 	ret = iopf_for_domain_replace(domain, old, dev);
4433 	if (ret)
4434 		return ret;
4435 
4436 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4437 	if (ret) {
4438 		iopf_for_domain_replace(old, domain, dev);
4439 		return ret;
4440 	}
4441 
4442 	domain_remove_dev_pasid(old, dev, pasid);
4443 	return 0;
4444 }
4445 
4446 static struct iommu_domain identity_domain = {
4447 	.type = IOMMU_DOMAIN_IDENTITY,
4448 	.ops = &(const struct iommu_domain_ops) {
4449 		.attach_dev	= identity_domain_attach_dev,
4450 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4451 	},
4452 };
4453 
4454 const struct iommu_domain_ops intel_fs_paging_domain_ops = {
4455 	.attach_dev = intel_iommu_attach_device,
4456 	.set_dev_pasid = intel_iommu_set_dev_pasid,
4457 	.map_pages = intel_iommu_map_pages,
4458 	.unmap_pages = intel_iommu_unmap_pages,
4459 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
4460 	.flush_iotlb_all = intel_flush_iotlb_all,
4461 	.iotlb_sync = intel_iommu_tlb_sync,
4462 	.iova_to_phys = intel_iommu_iova_to_phys,
4463 	.free = intel_iommu_domain_free,
4464 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
4465 };
4466 
4467 const struct iommu_domain_ops intel_ss_paging_domain_ops = {
4468 	.attach_dev = intel_iommu_attach_device,
4469 	.set_dev_pasid = intel_iommu_set_dev_pasid,
4470 	.map_pages = intel_iommu_map_pages,
4471 	.unmap_pages = intel_iommu_unmap_pages,
4472 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
4473 	.flush_iotlb_all = intel_flush_iotlb_all,
4474 	.iotlb_sync = intel_iommu_tlb_sync,
4475 	.iova_to_phys = intel_iommu_iova_to_phys,
4476 	.free = intel_iommu_domain_free,
4477 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
4478 };
4479 
4480 const struct iommu_ops intel_iommu_ops = {
4481 	.blocked_domain		= &blocking_domain,
4482 	.release_domain		= &blocking_domain,
4483 	.identity_domain	= &identity_domain,
4484 	.capable		= intel_iommu_capable,
4485 	.hw_info		= intel_iommu_hw_info,
4486 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4487 	.domain_alloc_sva	= intel_svm_domain_alloc,
4488 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4489 	.probe_device		= intel_iommu_probe_device,
4490 	.probe_finalize		= intel_iommu_probe_finalize,
4491 	.release_device		= intel_iommu_release_device,
4492 	.get_resv_regions	= intel_iommu_get_resv_regions,
4493 	.device_group		= intel_iommu_device_group,
4494 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4495 	.def_domain_type	= device_def_domain_type,
4496 	.page_response		= intel_iommu_page_response,
4497 };
4498 
4499 static void quirk_iommu_igfx(struct pci_dev *dev)
4500 {
4501 	if (risky_device(dev))
4502 		return;
4503 
4504 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4505 	disable_igfx_iommu = 1;
4506 }
4507 
4508 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4516 
4517 /* QM57/QS57 integrated gfx malfunctions with dmar */
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4519 
4520 /* Broadwell igfx malfunctions with dmar */
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4527 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4529 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4530 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4539 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4540 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4545 
4546 static void quirk_iommu_rwbf(struct pci_dev *dev)
4547 {
4548 	if (risky_device(dev))
4549 		return;
4550 
4551 	/*
4552 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4553 	 * but needs it. Same seems to hold for the desktop versions.
4554 	 */
4555 	pci_info(dev, "Forcing write-buffer flush capability\n");
4556 	rwbf_quirk = 1;
4557 }
4558 
4559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4560 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4562 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4563 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4564 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4565 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4566 
4567 #define GGC 0x52
4568 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4569 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4570 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4571 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4572 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4573 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4574 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4575 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4576 
4577 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4578 {
4579 	unsigned short ggc;
4580 
4581 	if (risky_device(dev))
4582 		return;
4583 
4584 	if (pci_read_config_word(dev, GGC, &ggc))
4585 		return;
4586 
4587 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4588 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4589 		disable_igfx_iommu = 1;
4590 	} else if (!disable_igfx_iommu) {
4591 		/* we have to ensure the gfx device is idle before we flush */
4592 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4593 		iommu_set_dma_strict();
4594 	}
4595 }
4596 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4597 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4598 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4599 
4600 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4601 {
4602 	unsigned short ver;
4603 
4604 	if (!IS_GFX_DEVICE(dev))
4605 		return;
4606 
4607 	ver = (dev->device >> 8) & 0xff;
4608 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4609 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4610 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4611 		return;
4612 
4613 	if (risky_device(dev))
4614 		return;
4615 
4616 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4617 	iommu_skip_te_disable = 1;
4618 }
4619 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4620 
4621 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4622    ISOCH DMAR unit for the Azalia sound device, but not give it any
4623    TLB entries, which causes it to deadlock. Check for that.  We do
4624    this in a function called from init_dmars(), instead of in a PCI
4625    quirk, because we don't want to print the obnoxious "BIOS broken"
4626    message if VT-d is actually disabled.
4627 */
4628 static void __init check_tylersburg_isoch(void)
4629 {
4630 	struct pci_dev *pdev;
4631 	uint32_t vtisochctrl;
4632 
4633 	/* If there's no Azalia in the system anyway, forget it. */
4634 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4635 	if (!pdev)
4636 		return;
4637 
4638 	if (risky_device(pdev)) {
4639 		pci_dev_put(pdev);
4640 		return;
4641 	}
4642 
4643 	pci_dev_put(pdev);
4644 
4645 	/* System Management Registers. Might be hidden, in which case
4646 	   we can't do the sanity check. But that's OK, because the
4647 	   known-broken BIOSes _don't_ actually hide it, so far. */
4648 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4649 	if (!pdev)
4650 		return;
4651 
4652 	if (risky_device(pdev)) {
4653 		pci_dev_put(pdev);
4654 		return;
4655 	}
4656 
4657 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4658 		pci_dev_put(pdev);
4659 		return;
4660 	}
4661 
4662 	pci_dev_put(pdev);
4663 
4664 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4665 	if (vtisochctrl & 1)
4666 		return;
4667 
4668 	/* Drop all bits other than the number of TLB entries */
4669 	vtisochctrl &= 0x1c;
4670 
4671 	/* If we have the recommended number of TLB entries (16), fine. */
4672 	if (vtisochctrl == 0x10)
4673 		return;
4674 
4675 	/* Zero TLB entries? You get to ride the short bus to school. */
4676 	if (!vtisochctrl) {
4677 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4678 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4679 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4680 		     dmi_get_system_info(DMI_BIOS_VERSION),
4681 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4682 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4683 		return;
4684 	}
4685 
4686 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4687 	       vtisochctrl);
4688 }
4689 
4690 /*
4691  * Here we deal with a device TLB defect where the device may inadvertently
4692  * complete an ATS invalidation before posted writes that were initiated with a
4693  * translated address matching the invalidation range, violating the
4694  * invalidation completion ordering.
4695  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4696  * vulnerable to this defect. In other words, any dTLB invalidation not initiated
4697  * under the control of the trusted/privileged host device driver must use this
4698  * quirk.
4699  * Device TLBs are invalidated under the following six conditions:
4700  * 1. Device driver does DMA API unmap IOVA
4701  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4702  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4703  *    exit_mmap() due to crash
4704  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4705  *    VM has to free pages that were unmapped
4706  * 5. Userspace driver unmaps a DMA buffer
4707  * 6. Cache invalidation in vSVA usage (upcoming)
4708  *
4709  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4710  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4711  * invalidate TLB the same way as normal user unmap which will use this quirk.
4712  * The dTLB invalidation after PASID cache flush does not need this quirk.
4713  *
4714  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4715  */
 */
4716 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4717 			       unsigned long address, unsigned long mask,
4718 			       u32 pasid, u16 qdep)
4719 {
4720 	u16 sid;
4721 
4722 	if (likely(!info->dtlb_extra_inval))
4723 		return;
4724 
4725 	sid = PCI_DEVID(info->bus, info->devfn);
4726 	if (pasid == IOMMU_NO_PASID) {
4727 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4728 				   qdep, address, mask);
4729 	} else {
4730 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4731 					 pasid, qdep, address, mask);
4732 	}
4733 }
4734 
4735 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4736 
4737 /*
4738  * Function to submit a command to the enhanced command interface. The
4739  * valid enhanced command descriptions are defined in Table 47 of the
4740  * VT-d spec. The VT-d hardware implementation may support some but not
4741  * all commands, which can be determined by checking the Enhanced
4742  * Command Capability Register.
4743  *
4744  * Return values:
4745  *  - 0: Command successful without any error;
4746  *  - Negative: software error value;
4747  *  - Nonzero positive: failure status code defined in Table 48.
4748  */
4749 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4750 {
4751 	unsigned long flags;
4752 	u64 res;
4753 	int ret;
4754 
4755 	if (!cap_ecmds(iommu->cap))
4756 		return -ENODEV;
4757 
4758 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4759 
4760 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4761 	if (res & DMA_ECMD_ECRSP_IP) {
4762 		ret = -EBUSY;
4763 		goto err;
4764 	}
4765 
4766 	/*
4767 	 * Unconditionally write operand B, because:
4768 	 * - There is no side effect if an ecmd doesn't require an
4769 	 *   operand B; the register is simply set to some value.
4770 	 * - This path is never performance critical, so the extra
4771 	 *   MMIO write is not a concern.
4772 	 */
4773 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4774 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4775 
4776 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4777 		      !(res & DMA_ECMD_ECRSP_IP), res);
4778 
4779 	if (res & DMA_ECMD_ECRSP_IP) {
4780 		ret = -ETIMEDOUT;
4781 		goto err;
4782 	}
4783 
4784 	ret = ecmd_get_status_code(res);
4785 err:
4786 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4787 
4788 	return ret;
4789 }
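/*
 * Illustrative sketch (not part of this driver) of the return convention
 * documented above. EXAMPLE_ECMD_OPCODE is a placeholder for one of the
 * opcodes from Table 47, not a real macro in this driver.
 */
static int __maybe_unused example_submit_ecmd(struct intel_iommu *iommu)
{
	int ret = ecmd_submit_sync(iommu, EXAMPLE_ECMD_OPCODE, 0, 0);

	if (ret < 0)		/* software error, e.g. -ENODEV or -EBUSY */
		return ret;
	if (ret > 0)		/* hardware failure status code (Table 48) */
		return -EIO;

	return 0;		/* command completed successfully */
}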
4790