xref: /linux/drivers/iommu/amd/iommu.c (revision 2a52ca7c98960aafb0eca9ef96b2d0c932171357)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
4  * Author: Joerg Roedel <jroedel@suse.de>
5  *         Leo Duran <leo.duran@amd.com>
6  */
7 
8 #define pr_fmt(fmt)     "AMD-Vi: " fmt
9 #define dev_fmt(fmt)    pr_fmt(fmt)
10 
11 #include <linux/ratelimit.h>
12 #include <linux/pci.h>
13 #include <linux/acpi.h>
14 #include <linux/pci-ats.h>
15 #include <linux/bitmap.h>
16 #include <linux/slab.h>
17 #include <linux/debugfs.h>
18 #include <linux/scatterlist.h>
19 #include <linux/dma-map-ops.h>
20 #include <linux/dma-direct.h>
21 #include <linux/iommu-helper.h>
22 #include <linux/delay.h>
23 #include <linux/amd-iommu.h>
24 #include <linux/notifier.h>
25 #include <linux/export.h>
26 #include <linux/irq.h>
27 #include <linux/msi.h>
28 #include <linux/irqdomain.h>
29 #include <linux/percpu.h>
30 #include <linux/io-pgtable.h>
31 #include <linux/cc_platform.h>
32 #include <asm/irq_remapping.h>
33 #include <asm/io_apic.h>
34 #include <asm/apic.h>
35 #include <asm/hw_irq.h>
36 #include <asm/proto.h>
37 #include <asm/iommu.h>
38 #include <asm/gart.h>
39 #include <asm/dma.h>
40 #include <uapi/linux/iommufd.h>
41 
42 #include "amd_iommu.h"
43 #include "../dma-iommu.h"
44 #include "../irq_remapping.h"
45 #include "../iommu-pages.h"
46 
/*
 * OR the command type @t, shifted left by 28 bits, into the second 32-bit
 * word of @cmd. Because the value is OR-ed in, data[1] must not already
 * carry conflicting bits in that position.
 */
#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

/* Reserved IOVA ranges */
#define MSI_RANGE_START		(0xfee00000)
#define MSI_RANGE_END		(0xfeefffff)
#define HT_RANGE_START		(0xfd00000000ULL)
#define HT_RANGE_END		(0xffffffffffULL)

/* NOTE(review): by its name, the page-table depth used by default - confirm */
#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
56 
/* NOTE(review): presumably guards the protection-domain ID bitmap - confirm */
static DEFINE_SPINLOCK(pd_bitmap_lock);

/*
 * Global list heads. acpihid_map is walked by get_acpihid_device_id() and
 * acpihid_device_group() below to match ACPI HID devices to device IDs.
 */
LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);

/* Declared here without initialisers; presumably populated later in this file */
const struct iommu_ops amd_iommu_ops;
static const struct iommu_dirty_ops amd_dirty_ops;

/* Starts out as -1. NOTE(review): presumably means "not determined yet" - confirm */
int amd_iommu_max_glx_val = -1;
67 
/*
 * General struct to manage commands sent to an IOMMU. A command is four
 * 32-bit words; the build_*() helpers below fill them in and store the
 * command type in data[1] via CMD_SET_TYPE().
 */
struct iommu_cmd {
	u32 data[4];
};
74 
/* NOTE(review): by its name, a slab cache for IRQ remapping data - confirm */
struct kmem_cache *amd_iommu_irq_cache;

/*
 * Forward declarations: detach_device() is called by
 * amd_iommu_uninit_device() before its definition is visible.
 */
static void detach_device(struct device *dev);

static void set_dte_entry(struct amd_iommu *iommu,
			  struct iommu_dev_data *dev_data);
81 
82 /****************************************************************************
83  *
84  * Helper functions
85  *
86  ****************************************************************************/
87 
88 static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
89 {
90 	return (pdom && (pdom->pd_mode == PD_MODE_V2));
91 }
92 
93 static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom)
94 {
95 	return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY);
96 }
97 
98 /*
99  * We cannot support PASID w/ existing v1 page table in the same domain
100  * since it will be nested. However, existing domain w/ v2 page table
101  * or passthrough mode can be used for PASID.
102  */
103 static inline bool pdom_is_sva_capable(struct protection_domain *pdom)
104 {
105 	return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom);
106 }
107 
108 static inline int get_acpihid_device_id(struct device *dev,
109 					struct acpihid_map_entry **entry)
110 {
111 	struct acpi_device *adev = ACPI_COMPANION(dev);
112 	struct acpihid_map_entry *p;
113 
114 	if (!adev)
115 		return -ENODEV;
116 
117 	list_for_each_entry(p, &acpihid_map, list) {
118 		if (acpi_dev_hid_uid_match(adev, p->hid,
119 					   p->uid[0] ? p->uid : NULL)) {
120 			if (entry)
121 				*entry = p;
122 			return p->devid;
123 		}
124 	}
125 	return -EINVAL;
126 }
127 
128 static inline int get_device_sbdf_id(struct device *dev)
129 {
130 	int sbdf;
131 
132 	if (dev_is_pci(dev))
133 		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
134 	else
135 		sbdf = get_acpihid_device_id(dev, NULL);
136 
137 	return sbdf;
138 }
139 
140 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
141 {
142 	struct dev_table_entry *dev_table;
143 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
144 
145 	BUG_ON(pci_seg == NULL);
146 	dev_table = pci_seg->dev_table;
147 	BUG_ON(dev_table == NULL);
148 
149 	return dev_table;
150 }
151 
152 static inline u16 get_device_segment(struct device *dev)
153 {
154 	u16 seg;
155 
156 	if (dev_is_pci(dev)) {
157 		struct pci_dev *pdev = to_pci_dev(dev);
158 
159 		seg = pci_domain_nr(pdev->bus);
160 	} else {
161 		u32 devid = get_acpihid_device_id(dev, NULL);
162 
163 		seg = PCI_SBDF_TO_SEGID(devid);
164 	}
165 
166 	return seg;
167 }
168 
169 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */
170 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
171 {
172 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
173 
174 	pci_seg->rlookup_table[devid] = iommu;
175 }
176 
177 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
178 {
179 	struct amd_iommu_pci_seg *pci_seg;
180 
181 	for_each_pci_segment(pci_seg) {
182 		if (pci_seg->id == seg)
183 			return pci_seg->rlookup_table[devid];
184 	}
185 	return NULL;
186 }
187 
188 static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
189 {
190 	u16 seg = get_device_segment(dev);
191 	int devid = get_device_sbdf_id(dev);
192 
193 	if (devid < 0)
194 		return NULL;
195 	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
196 }
197 
198 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
199 {
200 	struct iommu_dev_data *dev_data;
201 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
202 
203 	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
204 	if (!dev_data)
205 		return NULL;
206 
207 	spin_lock_init(&dev_data->lock);
208 	dev_data->devid = devid;
209 	ratelimit_default_init(&dev_data->rs);
210 
211 	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
212 	return dev_data;
213 }
214 
215 static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
216 {
217 	struct iommu_dev_data *dev_data;
218 	struct llist_node *node;
219 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
220 
221 	if (llist_empty(&pci_seg->dev_data_list))
222 		return NULL;
223 
224 	node = pci_seg->dev_data_list.first;
225 	llist_for_each_entry(dev_data, node, dev_data_list) {
226 		if (dev_data->devid == devid)
227 			return dev_data;
228 	}
229 
230 	return NULL;
231 }
232 
233 static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
234 {
235 	struct amd_iommu *iommu;
236 	struct dev_table_entry *dev_table;
237 	u16 devid = pci_dev_id(pdev);
238 
239 	if (devid == alias)
240 		return 0;
241 
242 	iommu = rlookup_amd_iommu(&pdev->dev);
243 	if (!iommu)
244 		return 0;
245 
246 	amd_iommu_set_rlookup_table(iommu, alias);
247 	dev_table = get_dev_table(iommu);
248 	memcpy(dev_table[alias].data,
249 	       dev_table[devid].data,
250 	       sizeof(dev_table[alias].data));
251 
252 	return 0;
253 }
254 
255 static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
256 {
257 	struct pci_dev *pdev;
258 
259 	if (!dev_is_pci(dev))
260 		return;
261 	pdev = to_pci_dev(dev);
262 
263 	/*
264 	 * The IVRS alias stored in the alias table may not be
265 	 * part of the PCI DMA aliases if it's bus differs
266 	 * from the original device.
267 	 */
268 	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
269 
270 	pci_for_each_dma_alias(pdev, clone_alias, NULL);
271 }
272 
273 static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
274 {
275 	struct pci_dev *pdev = to_pci_dev(dev);
276 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
277 	u16 ivrs_alias;
278 
279 	/* For ACPI HID devices, there are no aliases */
280 	if (!dev_is_pci(dev))
281 		return;
282 
283 	/*
284 	 * Add the IVRS alias to the pci aliases if it is on the same
285 	 * bus. The IVRS table may know about a quirk that we don't.
286 	 */
287 	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
288 	if (ivrs_alias != pci_dev_id(pdev) &&
289 	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
290 		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
291 
292 	clone_aliases(iommu, dev);
293 }
294 
295 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
296 {
297 	struct iommu_dev_data *dev_data;
298 
299 	dev_data = search_dev_data(iommu, devid);
300 
301 	if (dev_data == NULL) {
302 		dev_data = alloc_dev_data(iommu, devid);
303 		if (!dev_data)
304 			return NULL;
305 
306 		if (translation_pre_enabled(iommu))
307 			dev_data->defer_attach = true;
308 	}
309 
310 	return dev_data;
311 }
312 
313 /*
314 * Find or create an IOMMU group for a acpihid device.
315 */
316 static struct iommu_group *acpihid_device_group(struct device *dev)
317 {
318 	struct acpihid_map_entry *p, *entry = NULL;
319 	int devid;
320 
321 	devid = get_acpihid_device_id(dev, &entry);
322 	if (devid < 0)
323 		return ERR_PTR(devid);
324 
325 	list_for_each_entry(p, &acpihid_map, list) {
326 		if ((devid == p->devid) && p->group)
327 			entry->group = p->group;
328 	}
329 
330 	if (!entry->group)
331 		entry->group = generic_device_group(dev);
332 	else
333 		iommu_group_ref_get(entry->group);
334 
335 	return entry->group;
336 }
337 
338 static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
339 {
340 	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
341 }
342 
343 static u32 pdev_get_caps(struct pci_dev *pdev)
344 {
345 	int features;
346 	u32 flags = 0;
347 
348 	if (pci_ats_supported(pdev))
349 		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
350 
351 	if (pci_pri_supported(pdev))
352 		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
353 
354 	features = pci_pasid_features(pdev);
355 	if (features >= 0) {
356 		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
357 
358 		if (features & PCI_PASID_CAP_EXEC)
359 			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
360 
361 		if (features & PCI_PASID_CAP_PRIV)
362 			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
363 	}
364 
365 	return flags;
366 }
367 
368 static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
369 {
370 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
371 	int ret = -EINVAL;
372 
373 	if (dev_data->ats_enabled)
374 		return 0;
375 
376 	if (amd_iommu_iotlb_sup &&
377 	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
378 		ret = pci_enable_ats(pdev, PAGE_SHIFT);
379 		if (!ret) {
380 			dev_data->ats_enabled = 1;
381 			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
382 		}
383 	}
384 
385 	return ret;
386 }
387 
388 static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
389 {
390 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
391 
392 	if (dev_data->ats_enabled) {
393 		pci_disable_ats(pdev);
394 		dev_data->ats_enabled = 0;
395 	}
396 }
397 
398 static inline int pdev_enable_cap_pri(struct pci_dev *pdev)
399 {
400 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
401 	int ret = -EINVAL;
402 
403 	if (dev_data->pri_enabled)
404 		return 0;
405 
406 	if (!dev_data->ats_enabled)
407 		return 0;
408 
409 	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
410 		/*
411 		 * First reset the PRI state of the device.
412 		 * FIXME: Hardcode number of outstanding requests for now
413 		 */
414 		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
415 			dev_data->pri_enabled = 1;
416 			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
417 
418 			ret = 0;
419 		}
420 	}
421 
422 	return ret;
423 }
424 
425 static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
426 {
427 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
428 
429 	if (dev_data->pri_enabled) {
430 		pci_disable_pri(pdev);
431 		dev_data->pri_enabled = 0;
432 	}
433 }
434 
435 static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
436 {
437 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
438 	int ret = -EINVAL;
439 
440 	if (dev_data->pasid_enabled)
441 		return 0;
442 
443 	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
444 		/* Only allow access to user-accessible pages */
445 		ret = pci_enable_pasid(pdev, 0);
446 		if (!ret)
447 			dev_data->pasid_enabled = 1;
448 	}
449 
450 	return ret;
451 }
452 
453 static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
454 {
455 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
456 
457 	if (dev_data->pasid_enabled) {
458 		pci_disable_pasid(pdev);
459 		dev_data->pasid_enabled = 0;
460 	}
461 }
462 
/*
 * Try to enable ATS, PASID and PRI on @pdev, in that order. Return values
 * are ignored: each capability is enabled on a best-effort basis. ATS goes
 * first because pdev_enable_cap_pri() does nothing while ATS is disabled.
 */
static void pdev_enable_caps(struct pci_dev *pdev)
{
	pdev_enable_cap_ats(pdev);
	pdev_enable_cap_pasid(pdev);
	pdev_enable_cap_pri(pdev);
}
469 
/*
 * Disable ATS, PASID and PRI on @pdev, in that order. Each helper is a
 * no-op for a capability that was not enabled through this driver.
 */
static void pdev_disable_caps(struct pci_dev *pdev)
{
	pdev_disable_cap_ats(pdev);
	pdev_disable_cap_pasid(pdev);
	pdev_disable_cap_pri(pdev);
}
476 
477 /*
478  * This function checks if the driver got a valid device from the caller to
479  * avoid dereferencing invalid pointers.
480  */
481 static bool check_device(struct device *dev)
482 {
483 	struct amd_iommu_pci_seg *pci_seg;
484 	struct amd_iommu *iommu;
485 	int devid, sbdf;
486 
487 	if (!dev)
488 		return false;
489 
490 	sbdf = get_device_sbdf_id(dev);
491 	if (sbdf < 0)
492 		return false;
493 	devid = PCI_SBDF_TO_DEVID(sbdf);
494 
495 	iommu = rlookup_amd_iommu(dev);
496 	if (!iommu)
497 		return false;
498 
499 	/* Out of our scope? */
500 	pci_seg = iommu->pci_seg;
501 	if (devid > pci_seg->last_bdf)
502 		return false;
503 
504 	return true;
505 }
506 
507 static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
508 {
509 	struct iommu_dev_data *dev_data;
510 	int devid, sbdf;
511 
512 	if (dev_iommu_priv_get(dev))
513 		return 0;
514 
515 	sbdf = get_device_sbdf_id(dev);
516 	if (sbdf < 0)
517 		return sbdf;
518 
519 	devid = PCI_SBDF_TO_DEVID(sbdf);
520 	dev_data = find_dev_data(iommu, devid);
521 	if (!dev_data)
522 		return -ENOMEM;
523 
524 	dev_data->dev = dev;
525 	setup_aliases(iommu, dev);
526 
527 	/*
528 	 * By default we use passthrough mode for IOMMUv2 capable device.
529 	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
530 	 * invalid address), we ignore the capability for the device so
531 	 * it'll be forced to go into translation mode.
532 	 */
533 	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
534 	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
535 		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
536 	}
537 
538 	dev_iommu_priv_set(dev, dev_data);
539 
540 	return 0;
541 }
542 
543 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
544 {
545 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
546 	struct dev_table_entry *dev_table = get_dev_table(iommu);
547 	int devid, sbdf;
548 
549 	sbdf = get_device_sbdf_id(dev);
550 	if (sbdf < 0)
551 		return;
552 
553 	devid = PCI_SBDF_TO_DEVID(sbdf);
554 	pci_seg->rlookup_table[devid] = NULL;
555 	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
556 
557 	setup_aliases(iommu, dev);
558 }
559 
560 static void amd_iommu_uninit_device(struct device *dev)
561 {
562 	struct iommu_dev_data *dev_data;
563 
564 	dev_data = dev_iommu_priv_get(dev);
565 	if (!dev_data)
566 		return;
567 
568 	if (dev_data->domain)
569 		detach_device(dev);
570 
571 	/*
572 	 * We keep dev_data around for unplugged devices and reuse it when the
573 	 * device is re-plugged - not doing so would introduce a ton of races.
574 	 */
575 }
576 
577 /****************************************************************************
578  *
579  * Interrupt handling functions
580  *
581  ****************************************************************************/
582 
583 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
584 {
585 	int i;
586 	struct dev_table_entry *dev_table = get_dev_table(iommu);
587 
588 	for (i = 0; i < 4; ++i)
589 		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
590 }
591 
592 static void dump_command(unsigned long phys_addr)
593 {
594 	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
595 	int i;
596 
597 	for (i = 0; i < 4; ++i)
598 		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
599 }
600 
601 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
602 {
603 	struct iommu_dev_data *dev_data = NULL;
604 	int devid, vmg_tag, flags;
605 	struct pci_dev *pdev;
606 	u64 spa;
607 
608 	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
609 	vmg_tag = (event[1]) & 0xFFFF;
610 	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
611 	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
612 
613 	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
614 					   devid & 0xff);
615 	if (pdev)
616 		dev_data = dev_iommu_priv_get(&pdev->dev);
617 
618 	if (dev_data) {
619 		if (__ratelimit(&dev_data->rs)) {
620 			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
621 				vmg_tag, spa, flags);
622 		}
623 	} else {
624 		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
625 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
626 			vmg_tag, spa, flags);
627 	}
628 
629 	if (pdev)
630 		pci_dev_put(pdev);
631 }
632 
633 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
634 {
635 	struct iommu_dev_data *dev_data = NULL;
636 	int devid, flags_rmp, vmg_tag, flags;
637 	struct pci_dev *pdev;
638 	u64 gpa;
639 
640 	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
641 	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
642 	vmg_tag   = (event[1]) & 0xFFFF;
643 	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
644 	gpa       = ((u64)event[3] << 32) | event[2];
645 
646 	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
647 					   devid & 0xff);
648 	if (pdev)
649 		dev_data = dev_iommu_priv_get(&pdev->dev);
650 
651 	if (dev_data) {
652 		if (__ratelimit(&dev_data->rs)) {
653 			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
654 				vmg_tag, gpa, flags_rmp, flags);
655 		}
656 	} else {
657 		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
658 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
659 			vmg_tag, gpa, flags_rmp, flags);
660 	}
661 
662 	if (pdev)
663 		pci_dev_put(pdev);
664 }
665 
/* True when the I(nterrupt) flag is clear, i.e. the event is a DMA fault */
#define IS_IOMMU_MEM_TRANSACTION(flags)		\
	(((flags) & EVENT_FLAG_I) == 0)

/* Non-zero when the RW flag is set; selects IOMMU_FAULT_WRITE over _READ below */
#define IS_WRITE_REQUEST(flags)			\
	((flags) & EVENT_FLAG_RW)
671 
672 static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
673 					u16 devid, u16 domain_id,
674 					u64 address, int flags)
675 {
676 	struct iommu_dev_data *dev_data = NULL;
677 	struct pci_dev *pdev;
678 
679 	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
680 					   devid & 0xff);
681 	if (pdev)
682 		dev_data = dev_iommu_priv_get(&pdev->dev);
683 
684 	if (dev_data) {
685 		/*
686 		 * If this is a DMA fault (for which the I(nterrupt)
687 		 * bit will be unset), allow report_iommu_fault() to
688 		 * prevent logging it.
689 		 */
690 		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
691 			/* Device not attached to domain properly */
692 			if (dev_data->domain == NULL) {
693 				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
694 				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
695 						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
696 						   PCI_FUNC(devid), domain_id);
697 				goto out;
698 			}
699 
700 			if (!report_iommu_fault(&dev_data->domain->domain,
701 						&pdev->dev, address,
702 						IS_WRITE_REQUEST(flags) ?
703 							IOMMU_FAULT_WRITE :
704 							IOMMU_FAULT_READ))
705 				goto out;
706 		}
707 
708 		if (__ratelimit(&dev_data->rs)) {
709 			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
710 				domain_id, address, flags);
711 		}
712 	} else {
713 		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
714 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
715 			domain_id, address, flags);
716 	}
717 
718 out:
719 	if (pdev)
720 		pci_dev_put(pdev);
721 }
722 
723 static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
724 {
725 	struct device *dev = iommu->iommu.dev;
726 	int type, devid, flags, tag;
727 	volatile u32 *event = __evt;
728 	int count = 0;
729 	u64 address;
730 	u32 pasid;
731 
732 retry:
733 	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
734 	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
735 	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
736 		  (event[1] & EVENT_DOMID_MASK_LO);
737 	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
738 	address = (u64)(((u64)event[3]) << 32) | event[2];
739 
740 	if (type == 0) {
741 		/* Did we hit the erratum? */
742 		if (++count == LOOP_TIMEOUT) {
743 			pr_err("No event written to event log\n");
744 			return;
745 		}
746 		udelay(1);
747 		goto retry;
748 	}
749 
750 	if (type == EVENT_TYPE_IO_FAULT) {
751 		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
752 		return;
753 	}
754 
755 	switch (type) {
756 	case EVENT_TYPE_ILL_DEV:
757 		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
758 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
759 			pasid, address, flags);
760 		dump_dte_entry(iommu, devid);
761 		break;
762 	case EVENT_TYPE_DEV_TAB_ERR:
763 		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
764 			"address=0x%llx flags=0x%04x]\n",
765 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
766 			address, flags);
767 		break;
768 	case EVENT_TYPE_PAGE_TAB_ERR:
769 		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
770 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
771 			pasid, address, flags);
772 		break;
773 	case EVENT_TYPE_ILL_CMD:
774 		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
775 		dump_command(address);
776 		break;
777 	case EVENT_TYPE_CMD_HARD_ERR:
778 		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
779 			address, flags);
780 		break;
781 	case EVENT_TYPE_IOTLB_INV_TO:
782 		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
783 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
784 			address);
785 		break;
786 	case EVENT_TYPE_INV_DEV_REQ:
787 		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
788 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
789 			pasid, address, flags);
790 		break;
791 	case EVENT_TYPE_RMP_FAULT:
792 		amd_iommu_report_rmp_fault(iommu, event);
793 		break;
794 	case EVENT_TYPE_RMP_HW_ERR:
795 		amd_iommu_report_rmp_hw_error(iommu, event);
796 		break;
797 	case EVENT_TYPE_INV_PPR_REQ:
798 		pasid = PPR_PASID(*((u64 *)__evt));
799 		tag = event[1] & 0x03FF;
800 		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
801 			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
802 			pasid, address, flags, tag);
803 		break;
804 	default:
805 		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
806 			event[0], event[1], event[2], event[3]);
807 	}
808 
809 	/*
810 	 * To detect the hardware errata 732 we need to clear the
811 	 * entry back to zero. This issue does not exist on SNP
812 	 * enabled system. Also this buffer is not writeable on
813 	 * SNP enabled system.
814 	 */
815 	if (!amd_iommu_snp_en)
816 		memset(__evt, 0, 4 * sizeof(u32));
817 }
818 
819 static void iommu_poll_events(struct amd_iommu *iommu)
820 {
821 	u32 head, tail;
822 
823 	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
824 	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
825 
826 	while (head != tail) {
827 		iommu_print_event(iommu, iommu->evt_buf + head);
828 		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
829 	}
830 
831 	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
832 }
833 
834 #ifdef CONFIG_IRQ_REMAP
/*
 * Callback invoked by iommu_poll_ga_log() with the GA tag of each
 * GA_GUEST_NR log entry. NULL until a notifier has been registered.
 */
static int (*iommu_ga_log_notifier)(u32);

/*
 * Install @notifier as the GA log callback, replacing any previous one.
 * Always returns 0.
 */
int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
{
	iommu_ga_log_notifier = notifier;

	return 0;
}
EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
844 
845 static void iommu_poll_ga_log(struct amd_iommu *iommu)
846 {
847 	u32 head, tail;
848 
849 	if (iommu->ga_log == NULL)
850 		return;
851 
852 	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
853 	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
854 
855 	while (head != tail) {
856 		volatile u64 *raw;
857 		u64 log_entry;
858 
859 		raw = (u64 *)(iommu->ga_log + head);
860 
861 		/* Avoid memcpy function-call overhead */
862 		log_entry = *raw;
863 
864 		/* Update head pointer of hardware ring-buffer */
865 		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
866 		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
867 
868 		/* Handle GA entry */
869 		switch (GA_REQ_TYPE(log_entry)) {
870 		case GA_GUEST_NR:
871 			if (!iommu_ga_log_notifier)
872 				break;
873 
874 			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
875 				 __func__, GA_DEVID(log_entry),
876 				 GA_TAG(log_entry));
877 
878 			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
879 				pr_err("GA log notifier failed.\n");
880 			break;
881 		default:
882 			break;
883 		}
884 	}
885 }
886 
887 static void
888 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
889 {
890 	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
891 	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
892 		return;
893 
894 	dev_set_msi_domain(dev, iommu->ir_domain);
895 }
896 
897 #else /* CONFIG_IRQ_REMAP */
/* Without CONFIG_IRQ_REMAP there is no IR domain to assign: no-op stub. */
static inline void
amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
900 #endif /* !CONFIG_IRQ_REMAP */
901 
902 static void amd_iommu_handle_irq(void *data, const char *evt_type,
903 				 u32 int_mask, u32 overflow_mask,
904 				 void (*int_handler)(struct amd_iommu *),
905 				 void (*overflow_handler)(struct amd_iommu *))
906 {
907 	struct amd_iommu *iommu = (struct amd_iommu *) data;
908 	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
909 	u32 mask = int_mask | overflow_mask;
910 
911 	while (status & mask) {
912 		/* Enable interrupt sources again */
913 		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
914 
915 		if (int_handler) {
916 			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
917 				 iommu->index, evt_type);
918 			int_handler(iommu);
919 		}
920 
921 		if ((status & overflow_mask) && overflow_handler)
922 			overflow_handler(iommu);
923 
924 		/*
925 		 * Hardware bug: ERBT1312
926 		 * When re-enabling interrupt (by writing 1
927 		 * to clear the bit), the hardware might also try to set
928 		 * the interrupt bit in the event status register.
929 		 * In this scenario, the bit will be set, and disable
930 		 * subsequent interrupts.
931 		 *
932 		 * Workaround: The IOMMU driver should read back the
933 		 * status register and check if the interrupt bits are cleared.
934 		 * If not, driver will need to go through the interrupt handler
935 		 * again and re-clear the bits
936 		 */
937 		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
938 	}
939 }
940 
/*
 * Service the event log of the IOMMU passed in @data: entries are drained
 * with iommu_poll_events() and an overflow is handled by
 * amd_iommu_restart_event_logging(). @irq is unused. Always returns
 * IRQ_HANDLED.
 */
irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
			     MMIO_STATUS_EVT_OVERFLOW_MASK,
			     iommu_poll_events, amd_iommu_restart_event_logging);

	return IRQ_HANDLED;
}
949 
/*
 * Service the PPR log of the IOMMU passed in @data: entries are drained
 * with amd_iommu_poll_ppr_log() and an overflow is handled by
 * amd_iommu_restart_ppr_log(). @irq is unused. Always returns IRQ_HANDLED.
 */
irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
			     MMIO_STATUS_PPR_OVERFLOW_MASK,
			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);

	return IRQ_HANDLED;
}
958 
/*
 * Service the GA log of the IOMMU passed in @data. The log is only
 * processed when CONFIG_IRQ_REMAP is enabled; otherwise this just returns.
 * @irq is unused. Always returns IRQ_HANDLED.
 */
irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
{
#ifdef CONFIG_IRQ_REMAP
	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
#endif

	return IRQ_HANDLED;
}
969 
/*
 * Combined handler: services the event log, the PPR log and the GA log in
 * turn for the IOMMU passed in @data. The individual return values are
 * ignored. Always returns IRQ_HANDLED.
 */
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	amd_iommu_int_thread_evtlog(irq, data);
	amd_iommu_int_thread_pprlog(irq, data);
	amd_iommu_int_thread_galog(irq, data);

	return IRQ_HANDLED;
}
978 
/*
 * Does no work itself: unconditionally returns IRQ_WAKE_THREAD. Both
 * arguments are unused.
 */
irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}
983 
984 /****************************************************************************
985  *
986  * IOMMU command queuing functions
987  *
988  ****************************************************************************/
989 
990 static int wait_on_sem(struct amd_iommu *iommu, u64 data)
991 {
992 	int i = 0;
993 
994 	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
995 		udelay(1);
996 		i += 1;
997 	}
998 
999 	if (i == LOOP_TIMEOUT) {
1000 		pr_alert("Completion-Wait loop timed out\n");
1001 		return -EIO;
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 static void copy_cmd_to_buffer(struct amd_iommu *iommu,
1008 			       struct iommu_cmd *cmd)
1009 {
1010 	u8 *target;
1011 	u32 tail;
1012 
1013 	/* Copy command to buffer */
1014 	tail = iommu->cmd_buf_tail;
1015 	target = iommu->cmd_buf + tail;
1016 	memcpy(target, cmd, sizeof(*cmd));
1017 
1018 	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1019 	iommu->cmd_buf_tail = tail;
1020 
1021 	/* Tell the IOMMU about it */
1022 	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1023 }
1024 
1025 static void build_completion_wait(struct iommu_cmd *cmd,
1026 				  struct amd_iommu *iommu,
1027 				  u64 data)
1028 {
1029 	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1030 
1031 	memset(cmd, 0, sizeof(*cmd));
1032 	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1033 	cmd->data[1] = upper_32_bits(paddr);
1034 	cmd->data[2] = lower_32_bits(data);
1035 	cmd->data[3] = upper_32_bits(data);
1036 	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1037 }
1038 
1039 static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1040 {
1041 	memset(cmd, 0, sizeof(*cmd));
1042 	cmd->data[0] = devid;
1043 	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1044 }
1045 
1046 /*
1047  * Builds an invalidation address which is suitable for one page or multiple
1048  * pages. Sets the size bit (S) as needed is more than one page is flushed.
1049  */
1050 static inline u64 build_inv_address(u64 address, size_t size)
1051 {
1052 	u64 pages, end, msb_diff;
1053 
1054 	pages = iommu_num_pages(address, size, PAGE_SIZE);
1055 
1056 	if (pages == 1)
1057 		return address & PAGE_MASK;
1058 
1059 	end = address + size - 1;
1060 
1061 	/*
1062 	 * msb_diff would hold the index of the most significant bit that
1063 	 * flipped between the start and end.
1064 	 */
1065 	msb_diff = fls64(end ^ address) - 1;
1066 
1067 	/*
1068 	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1069 	 * between the start and the end, invalidate everything.
1070 	 */
1071 	if (unlikely(msb_diff > 51)) {
1072 		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1073 	} else {
1074 		/*
1075 		 * The msb-bit must be clear on the address. Just set all the
1076 		 * lower bits.
1077 		 */
1078 		address |= (1ull << msb_diff) - 1;
1079 	}
1080 
1081 	/* Clear bits 11:0 */
1082 	address &= PAGE_MASK;
1083 
1084 	/* Set the size bit - we flush more than one 4kb page */
1085 	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1086 }
1087 
1088 static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1089 				  size_t size, u16 domid,
1090 				  ioasid_t pasid, bool gn)
1091 {
1092 	u64 inv_address = build_inv_address(address, size);
1093 
1094 	memset(cmd, 0, sizeof(*cmd));
1095 
1096 	cmd->data[1] |= domid;
1097 	cmd->data[2]  = lower_32_bits(inv_address);
1098 	cmd->data[3]  = upper_32_bits(inv_address);
1099 	/* PDE bit - we want to flush everything, not only the PTEs */
1100 	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1101 	if (gn) {
1102 		cmd->data[0] |= pasid;
1103 		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1104 	}
1105 	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1106 }
1107 
1108 static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1109 				  u64 address, size_t size,
1110 				  ioasid_t pasid, bool gn)
1111 {
1112 	u64 inv_address = build_inv_address(address, size);
1113 
1114 	memset(cmd, 0, sizeof(*cmd));
1115 
1116 	cmd->data[0]  = devid;
1117 	cmd->data[0] |= (qdep & 0xff) << 24;
1118 	cmd->data[1]  = devid;
1119 	cmd->data[2]  = lower_32_bits(inv_address);
1120 	cmd->data[3]  = upper_32_bits(inv_address);
1121 	if (gn) {
1122 		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1123 		cmd->data[1] |= (pasid & 0xff) << 16;
1124 		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1125 	}
1126 
1127 	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1128 }
1129 
1130 static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1131 			       int status, int tag, u8 gn)
1132 {
1133 	memset(cmd, 0, sizeof(*cmd));
1134 
1135 	cmd->data[0]  = devid;
1136 	if (gn) {
1137 		cmd->data[1]  = pasid;
1138 		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1139 	}
1140 	cmd->data[3]  = tag & 0x1ff;
1141 	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1142 
1143 	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1144 }
1145 
1146 static void build_inv_all(struct iommu_cmd *cmd)
1147 {
1148 	memset(cmd, 0, sizeof(*cmd));
1149 	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1150 }
1151 
1152 static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1153 {
1154 	memset(cmd, 0, sizeof(*cmd));
1155 	cmd->data[0] = devid;
1156 	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1157 }
1158 
1159 /*
1160  * Writes the command to the IOMMUs command buffer and informs the
1161  * hardware about the new command.
1162  */
1163 static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1164 				      struct iommu_cmd *cmd,
1165 				      bool sync)
1166 {
1167 	unsigned int count = 0;
1168 	u32 left, next_tail;
1169 
1170 	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1171 again:
1172 	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1173 
1174 	if (left <= 0x20) {
1175 		/* Skip udelay() the first time around */
1176 		if (count++) {
1177 			if (count == LOOP_TIMEOUT) {
1178 				pr_err("Command buffer timeout\n");
1179 				return -EIO;
1180 			}
1181 
1182 			udelay(1);
1183 		}
1184 
1185 		/* Update head and recheck remaining space */
1186 		iommu->cmd_buf_head = readl(iommu->mmio_base +
1187 					    MMIO_CMD_HEAD_OFFSET);
1188 
1189 		goto again;
1190 	}
1191 
1192 	copy_cmd_to_buffer(iommu, cmd);
1193 
1194 	/* Do we need to make sure all commands are processed? */
1195 	iommu->need_sync = sync;
1196 
1197 	return 0;
1198 }
1199 
1200 static int iommu_queue_command_sync(struct amd_iommu *iommu,
1201 				    struct iommu_cmd *cmd,
1202 				    bool sync)
1203 {
1204 	unsigned long flags;
1205 	int ret;
1206 
1207 	raw_spin_lock_irqsave(&iommu->lock, flags);
1208 	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1209 	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1210 
1211 	return ret;
1212 }
1213 
1214 static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1215 {
1216 	return iommu_queue_command_sync(iommu, cmd, true);
1217 }
1218 
1219 /*
1220  * This function queues a completion wait command into the command
1221  * buffer of an IOMMU
1222  */
1223 static int iommu_completion_wait(struct amd_iommu *iommu)
1224 {
1225 	struct iommu_cmd cmd;
1226 	unsigned long flags;
1227 	int ret;
1228 	u64 data;
1229 
1230 	if (!iommu->need_sync)
1231 		return 0;
1232 
1233 	data = atomic64_add_return(1, &iommu->cmd_sem_val);
1234 	build_completion_wait(&cmd, iommu, data);
1235 
1236 	raw_spin_lock_irqsave(&iommu->lock, flags);
1237 
1238 	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1239 	if (ret)
1240 		goto out_unlock;
1241 
1242 	ret = wait_on_sem(iommu, data);
1243 
1244 out_unlock:
1245 	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1246 
1247 	return ret;
1248 }
1249 
1250 static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1251 {
1252 	struct iommu_cmd cmd;
1253 
1254 	build_inv_dte(&cmd, devid);
1255 
1256 	return iommu_queue_command(iommu, &cmd);
1257 }
1258 
1259 static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1260 {
1261 	u32 devid;
1262 	u16 last_bdf = iommu->pci_seg->last_bdf;
1263 
1264 	for (devid = 0; devid <= last_bdf; ++devid)
1265 		iommu_flush_dte(iommu, devid);
1266 
1267 	iommu_completion_wait(iommu);
1268 }
1269 
1270 /*
1271  * This function uses heavy locking and may disable irqs for some time. But
1272  * this is no issue because it is only called during resume.
1273  */
1274 static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1275 {
1276 	u32 dom_id;
1277 	u16 last_bdf = iommu->pci_seg->last_bdf;
1278 
1279 	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1280 		struct iommu_cmd cmd;
1281 		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1282 				      dom_id, IOMMU_NO_PASID, false);
1283 		iommu_queue_command(iommu, &cmd);
1284 	}
1285 
1286 	iommu_completion_wait(iommu);
1287 }
1288 
1289 static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1290 {
1291 	struct iommu_cmd cmd;
1292 
1293 	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1294 			      dom_id, IOMMU_NO_PASID, false);
1295 	iommu_queue_command(iommu, &cmd);
1296 
1297 	iommu_completion_wait(iommu);
1298 }
1299 
1300 static void amd_iommu_flush_all(struct amd_iommu *iommu)
1301 {
1302 	struct iommu_cmd cmd;
1303 
1304 	build_inv_all(&cmd);
1305 
1306 	iommu_queue_command(iommu, &cmd);
1307 	iommu_completion_wait(iommu);
1308 }
1309 
1310 static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1311 {
1312 	struct iommu_cmd cmd;
1313 
1314 	build_inv_irt(&cmd, devid);
1315 
1316 	iommu_queue_command(iommu, &cmd);
1317 }
1318 
1319 static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1320 {
1321 	u32 devid;
1322 	u16 last_bdf = iommu->pci_seg->last_bdf;
1323 
1324 	if (iommu->irtcachedis_enabled)
1325 		return;
1326 
1327 	for (devid = 0; devid <= last_bdf; devid++)
1328 		iommu_flush_irt(iommu, devid);
1329 
1330 	iommu_completion_wait(iommu);
1331 }
1332 
1333 void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1334 {
1335 	if (check_feature(FEATURE_IA)) {
1336 		amd_iommu_flush_all(iommu);
1337 	} else {
1338 		amd_iommu_flush_dte_all(iommu);
1339 		amd_iommu_flush_irt_all(iommu);
1340 		amd_iommu_flush_tlb_all(iommu);
1341 	}
1342 }
1343 
1344 /*
1345  * Command send function for flushing on-device TLB
1346  */
1347 static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1348 			      size_t size, ioasid_t pasid, bool gn)
1349 {
1350 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1351 	struct iommu_cmd cmd;
1352 	int qdep = dev_data->ats_qdep;
1353 
1354 	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1355 			      size, pasid, gn);
1356 
1357 	return iommu_queue_command(iommu, &cmd);
1358 }
1359 
1360 static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1361 {
1362 	struct amd_iommu *iommu = data;
1363 
1364 	return iommu_flush_dte(iommu, alias);
1365 }
1366 
1367 /*
1368  * Command send function for invalidating a device table entry
1369  */
1370 static int device_flush_dte(struct iommu_dev_data *dev_data)
1371 {
1372 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1373 	struct pci_dev *pdev = NULL;
1374 	struct amd_iommu_pci_seg *pci_seg;
1375 	u16 alias;
1376 	int ret;
1377 
1378 	if (dev_is_pci(dev_data->dev))
1379 		pdev = to_pci_dev(dev_data->dev);
1380 
1381 	if (pdev)
1382 		ret = pci_for_each_dma_alias(pdev,
1383 					     device_flush_dte_alias, iommu);
1384 	else
1385 		ret = iommu_flush_dte(iommu, dev_data->devid);
1386 	if (ret)
1387 		return ret;
1388 
1389 	pci_seg = iommu->pci_seg;
1390 	alias = pci_seg->alias_table[dev_data->devid];
1391 	if (alias != dev_data->devid) {
1392 		ret = iommu_flush_dte(iommu, alias);
1393 		if (ret)
1394 			return ret;
1395 	}
1396 
1397 	if (dev_data->ats_enabled) {
1398 		/* Invalidate the entire contents of an IOTLB */
1399 		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1400 					 IOMMU_NO_PASID, false);
1401 	}
1402 
1403 	return ret;
1404 }
1405 
1406 static int domain_flush_pages_v2(struct protection_domain *pdom,
1407 				 u64 address, size_t size)
1408 {
1409 	struct iommu_dev_data *dev_data;
1410 	struct iommu_cmd cmd;
1411 	int ret = 0;
1412 
1413 	list_for_each_entry(dev_data, &pdom->dev_list, list) {
1414 		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1415 		u16 domid = dev_data->gcr3_info.domid;
1416 
1417 		build_inv_iommu_pages(&cmd, address, size,
1418 				      domid, IOMMU_NO_PASID, true);
1419 
1420 		ret |= iommu_queue_command(iommu, &cmd);
1421 	}
1422 
1423 	return ret;
1424 }
1425 
1426 static int domain_flush_pages_v1(struct protection_domain *pdom,
1427 				 u64 address, size_t size)
1428 {
1429 	struct iommu_cmd cmd;
1430 	int ret = 0, i;
1431 
1432 	build_inv_iommu_pages(&cmd, address, size,
1433 			      pdom->id, IOMMU_NO_PASID, false);
1434 
1435 	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1436 		if (!pdom->dev_iommu[i])
1437 			continue;
1438 
1439 		/*
1440 		 * Devices of this domain are behind this IOMMU
1441 		 * We need a TLB flush
1442 		 */
1443 		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1444 	}
1445 
1446 	return ret;
1447 }
1448 
1449 /*
1450  * TLB invalidation function which is called from the mapping functions.
1451  * It flushes range of PTEs of the domain.
1452  */
1453 static void __domain_flush_pages(struct protection_domain *domain,
1454 				 u64 address, size_t size)
1455 {
1456 	struct iommu_dev_data *dev_data;
1457 	int ret = 0;
1458 	ioasid_t pasid = IOMMU_NO_PASID;
1459 	bool gn = false;
1460 
1461 	if (pdom_is_v2_pgtbl_mode(domain)) {
1462 		gn = true;
1463 		ret = domain_flush_pages_v2(domain, address, size);
1464 	} else {
1465 		ret = domain_flush_pages_v1(domain, address, size);
1466 	}
1467 
1468 	list_for_each_entry(dev_data, &domain->dev_list, list) {
1469 
1470 		if (!dev_data->ats_enabled)
1471 			continue;
1472 
1473 		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1474 	}
1475 
1476 	WARN_ON(ret);
1477 }
1478 
1479 void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1480 				  u64 address, size_t size)
1481 {
1482 	if (likely(!amd_iommu_np_cache)) {
1483 		__domain_flush_pages(domain, address, size);
1484 
1485 		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1486 		amd_iommu_domain_flush_complete(domain);
1487 
1488 		return;
1489 	}
1490 
1491 	/*
1492 	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1493 	 * In such setups it is best to avoid flushes of ranges which are not
1494 	 * naturally aligned, since it would lead to flushes of unmodified
1495 	 * PTEs. Such flushes would require the hypervisor to do more work than
1496 	 * necessary. Therefore, perform repeated flushes of aligned ranges
1497 	 * until you cover the range. Each iteration flushes the smaller
1498 	 * between the natural alignment of the address that we flush and the
1499 	 * greatest naturally aligned region that fits in the range.
1500 	 */
1501 	while (size != 0) {
1502 		int addr_alignment = __ffs(address);
1503 		int size_alignment = __fls(size);
1504 		int min_alignment;
1505 		size_t flush_size;
1506 
1507 		/*
1508 		 * size is always non-zero, but address might be zero, causing
1509 		 * addr_alignment to be negative. As the casting of the
1510 		 * argument in __ffs(address) to long might trim the high bits
1511 		 * of the address on x86-32, cast to long when doing the check.
1512 		 */
1513 		if (likely((unsigned long)address != 0))
1514 			min_alignment = min(addr_alignment, size_alignment);
1515 		else
1516 			min_alignment = size_alignment;
1517 
1518 		flush_size = 1ul << min_alignment;
1519 
1520 		__domain_flush_pages(domain, address, flush_size);
1521 		address += flush_size;
1522 		size -= flush_size;
1523 	}
1524 
1525 	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1526 	amd_iommu_domain_flush_complete(domain);
1527 }
1528 
1529 /* Flush the whole IO/TLB for a given protection domain - including PDE */
1530 static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1531 {
1532 	amd_iommu_domain_flush_pages(domain, 0,
1533 				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1534 }
1535 
1536 void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1537 				     ioasid_t pasid, u64 address, size_t size)
1538 {
1539 	struct iommu_cmd cmd;
1540 	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1541 
1542 	build_inv_iommu_pages(&cmd, address, size,
1543 			      dev_data->gcr3_info.domid, pasid, true);
1544 	iommu_queue_command(iommu, &cmd);
1545 
1546 	if (dev_data->ats_enabled)
1547 		device_flush_iotlb(dev_data, address, size, pasid, true);
1548 
1549 	iommu_completion_wait(iommu);
1550 }
1551 
1552 void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
1553 				   ioasid_t pasid)
1554 {
1555 	amd_iommu_dev_flush_pasid_pages(dev_data, 0,
1556 					CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid);
1557 }
1558 
1559 void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1560 {
1561 	int i;
1562 
1563 	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1564 		if (domain && !domain->dev_iommu[i])
1565 			continue;
1566 
1567 		/*
1568 		 * Devices of this domain are behind this IOMMU
1569 		 * We need to wait for completion of all commands.
1570 		 */
1571 		iommu_completion_wait(amd_iommus[i]);
1572 	}
1573 }
1574 
1575 /* Flush the not present cache if it exists */
1576 static void domain_flush_np_cache(struct protection_domain *domain,
1577 		dma_addr_t iova, size_t size)
1578 {
1579 	if (unlikely(amd_iommu_np_cache)) {
1580 		unsigned long flags;
1581 
1582 		spin_lock_irqsave(&domain->lock, flags);
1583 		amd_iommu_domain_flush_pages(domain, iova, size);
1584 		spin_unlock_irqrestore(&domain->lock, flags);
1585 	}
1586 }
1587 
1588 
1589 /*
1590  * This function flushes the DTEs for all devices in domain
1591  */
1592 static void domain_flush_devices(struct protection_domain *domain)
1593 {
1594 	struct iommu_dev_data *dev_data;
1595 
1596 	list_for_each_entry(dev_data, &domain->dev_list, list)
1597 		device_flush_dte(dev_data);
1598 }
1599 
1600 static void update_device_table(struct protection_domain *domain)
1601 {
1602 	struct iommu_dev_data *dev_data;
1603 
1604 	list_for_each_entry(dev_data, &domain->dev_list, list) {
1605 		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1606 
1607 		set_dte_entry(iommu, dev_data);
1608 		clone_aliases(iommu, dev_data->dev);
1609 	}
1610 }
1611 
/*
 * Rewrite the DTEs of all devices in @domain, then queue DTE flushes for
 * them. The update must come first so the flush covers the new entries.
 */
void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
{
	/* Step 1: write the new entries */
	update_device_table(domain);

	/* Step 2: invalidate what the hardware may have cached */
	domain_flush_devices(domain);
}
1617 
/*
 * Bring the hardware view of @domain up to date: rewrite and flush the
 * device table entries, then flush the domain's TLB entries and wait for
 * the flushes to complete.
 */
void amd_iommu_domain_update(struct protection_domain *domain)
{
	/* Device table first ... */
	amd_iommu_update_and_flush_device_table(domain);

	/* ... then the domain TLB(s), including the wait for completion */
	amd_iommu_domain_flush_all(domain);
}
1626 
1627 int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
1628 {
1629 	struct iommu_dev_data *dev_data;
1630 	struct amd_iommu *iommu;
1631 	struct iommu_cmd cmd;
1632 
1633 	dev_data = dev_iommu_priv_get(dev);
1634 	iommu    = get_amd_iommu_from_dev(dev);
1635 
1636 	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
1637 			   tag, dev_data->pri_tlp);
1638 
1639 	return iommu_queue_command(iommu, &cmd);
1640 }
1641 
1642 /****************************************************************************
1643  *
1644  * The next functions belong to the domain allocation. A domain is
1645  * allocated for every IOMMU as the default domain. If device isolation
1646  * is enabled, every device gets its own domain. The most important thing
1647  * about domains is the page table mapping the DMA address space they
1648  * contain.
1649  *
1650  ****************************************************************************/
1651 
1652 static u16 domain_id_alloc(void)
1653 {
1654 	unsigned long flags;
1655 	int id;
1656 
1657 	spin_lock_irqsave(&pd_bitmap_lock, flags);
1658 	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1659 	BUG_ON(id == 0);
1660 	if (id > 0 && id < MAX_DOMAIN_ID)
1661 		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1662 	else
1663 		id = 0;
1664 	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1665 
1666 	return id;
1667 }
1668 
1669 static void domain_id_free(int id)
1670 {
1671 	unsigned long flags;
1672 
1673 	spin_lock_irqsave(&pd_bitmap_lock, flags);
1674 	if (id > 0 && id < MAX_DOMAIN_ID)
1675 		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1676 	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1677 }
1678 
1679 static void free_gcr3_tbl_level1(u64 *tbl)
1680 {
1681 	u64 *ptr;
1682 	int i;
1683 
1684 	for (i = 0; i < 512; ++i) {
1685 		if (!(tbl[i] & GCR3_VALID))
1686 			continue;
1687 
1688 		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1689 
1690 		iommu_free_page(ptr);
1691 	}
1692 }
1693 
1694 static void free_gcr3_tbl_level2(u64 *tbl)
1695 {
1696 	u64 *ptr;
1697 	int i;
1698 
1699 	for (i = 0; i < 512; ++i) {
1700 		if (!(tbl[i] & GCR3_VALID))
1701 			continue;
1702 
1703 		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1704 
1705 		free_gcr3_tbl_level1(ptr);
1706 	}
1707 }
1708 
1709 static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1710 {
1711 	if (gcr3_info->glx == 2)
1712 		free_gcr3_tbl_level2(gcr3_info->gcr3_tbl);
1713 	else if (gcr3_info->glx == 1)
1714 		free_gcr3_tbl_level1(gcr3_info->gcr3_tbl);
1715 	else
1716 		WARN_ON_ONCE(gcr3_info->glx != 0);
1717 
1718 	gcr3_info->glx = 0;
1719 
1720 	/* Free per device domain ID */
1721 	domain_id_free(gcr3_info->domid);
1722 
1723 	iommu_free_page(gcr3_info->gcr3_tbl);
1724 	gcr3_info->gcr3_tbl = NULL;
1725 }
1726 
1727 /*
1728  * Number of GCR3 table levels required. Level must be 4-Kbyte
1729  * page and can contain up to 512 entries.
1730  */
1731 static int get_gcr3_levels(int pasids)
1732 {
1733 	int levels;
1734 
1735 	if (pasids == -1)
1736 		return amd_iommu_max_glx_val;
1737 
1738 	levels = get_count_order(pasids);
1739 
1740 	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1741 }
1742 
1743 static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1744 			    struct amd_iommu *iommu, int pasids)
1745 {
1746 	int levels = get_gcr3_levels(pasids);
1747 	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
1748 
1749 	if (levels > amd_iommu_max_glx_val)
1750 		return -EINVAL;
1751 
1752 	if (gcr3_info->gcr3_tbl)
1753 		return -EBUSY;
1754 
1755 	/* Allocate per device domain ID */
1756 	gcr3_info->domid = domain_id_alloc();
1757 
1758 	gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC);
1759 	if (gcr3_info->gcr3_tbl == NULL) {
1760 		domain_id_free(gcr3_info->domid);
1761 		return -ENOMEM;
1762 	}
1763 
1764 	gcr3_info->glx = levels;
1765 
1766 	return 0;
1767 }
1768 
1769 static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1770 			   ioasid_t pasid, bool alloc)
1771 {
1772 	int index;
1773 	u64 *pte;
1774 	u64 *root = gcr3_info->gcr3_tbl;
1775 	int level = gcr3_info->glx;
1776 
1777 	while (true) {
1778 
1779 		index = (pasid >> (9 * level)) & 0x1ff;
1780 		pte   = &root[index];
1781 
1782 		if (level == 0)
1783 			break;
1784 
1785 		if (!(*pte & GCR3_VALID)) {
1786 			if (!alloc)
1787 				return NULL;
1788 
1789 			root = (void *)get_zeroed_page(GFP_ATOMIC);
1790 			if (root == NULL)
1791 				return NULL;
1792 
1793 			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
1794 		}
1795 
1796 		root = iommu_phys_to_virt(*pte & PAGE_MASK);
1797 
1798 		level -= 1;
1799 	}
1800 
1801 	return pte;
1802 }
1803 
1804 static int update_gcr3(struct iommu_dev_data *dev_data,
1805 		       ioasid_t pasid, unsigned long gcr3, bool set)
1806 {
1807 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1808 	u64 *pte;
1809 
1810 	pte = __get_gcr3_pte(gcr3_info, pasid, true);
1811 	if (pte == NULL)
1812 		return -ENOMEM;
1813 
1814 	if (set)
1815 		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1816 	else
1817 		*pte = 0;
1818 
1819 	amd_iommu_dev_flush_pasid_all(dev_data, pasid);
1820 	return 0;
1821 }
1822 
1823 int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1824 		       unsigned long gcr3)
1825 {
1826 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1827 	int ret;
1828 
1829 	iommu_group_mutex_assert(dev_data->dev);
1830 
1831 	ret = update_gcr3(dev_data, pasid, gcr3, true);
1832 	if (ret)
1833 		return ret;
1834 
1835 	gcr3_info->pasid_cnt++;
1836 	return ret;
1837 }
1838 
1839 int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1840 {
1841 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1842 	int ret;
1843 
1844 	iommu_group_mutex_assert(dev_data->dev);
1845 
1846 	ret = update_gcr3(dev_data, pasid, 0, false);
1847 	if (ret)
1848 		return ret;
1849 
1850 	gcr3_info->pasid_cnt--;
1851 	return ret;
1852 }
1853 
1854 static void set_dte_entry(struct amd_iommu *iommu,
1855 			  struct iommu_dev_data *dev_data)
1856 {
1857 	u64 pte_root = 0;
1858 	u64 flags = 0;
1859 	u32 old_domid;
1860 	u16 devid = dev_data->devid;
1861 	u16 domid;
1862 	struct protection_domain *domain = dev_data->domain;
1863 	struct dev_table_entry *dev_table = get_dev_table(iommu);
1864 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1865 
1866 	if (gcr3_info && gcr3_info->gcr3_tbl)
1867 		domid = dev_data->gcr3_info.domid;
1868 	else
1869 		domid = domain->id;
1870 
1871 	if (domain->iop.mode != PAGE_MODE_NONE)
1872 		pte_root = iommu_virt_to_phys(domain->iop.root);
1873 
1874 	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1875 		    << DEV_ENTRY_MODE_SHIFT;
1876 
1877 	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1878 
1879 	/*
1880 	 * When SNP is enabled, Only set TV bit when IOMMU
1881 	 * page translation is in use.
1882 	 */
1883 	if (!amd_iommu_snp_en || (domid != 0))
1884 		pte_root |= DTE_FLAG_TV;
1885 
1886 	flags = dev_table[devid].data[1];
1887 
1888 	if (dev_data->ats_enabled)
1889 		flags |= DTE_FLAG_IOTLB;
1890 
1891 	if (dev_data->ppr)
1892 		pte_root |= 1ULL << DEV_ENTRY_PPR;
1893 
1894 	if (domain->dirty_tracking)
1895 		pte_root |= DTE_FLAG_HAD;
1896 
1897 	if (gcr3_info && gcr3_info->gcr3_tbl) {
1898 		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
1899 		u64 glx  = gcr3_info->glx;
1900 		u64 tmp;
1901 
1902 		pte_root |= DTE_FLAG_GV;
1903 		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1904 
1905 		/* First mask out possible old values for GCR3 table */
1906 		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1907 		flags    &= ~tmp;
1908 
1909 		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1910 		flags    &= ~tmp;
1911 
1912 		/* Encode GCR3 table into DTE */
1913 		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1914 		pte_root |= tmp;
1915 
1916 		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1917 		flags    |= tmp;
1918 
1919 		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1920 		flags    |= tmp;
1921 
1922 		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1923 			dev_table[devid].data[2] |=
1924 				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1925 		}
1926 
1927 		/* GIOV is supported with V2 page table mode only */
1928 		if (pdom_is_v2_pgtbl_mode(domain))
1929 			pte_root |= DTE_FLAG_GIOV;
1930 	}
1931 
1932 	flags &= ~DEV_DOMID_MASK;
1933 	flags |= domid;
1934 
1935 	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1936 	dev_table[devid].data[1]  = flags;
1937 	dev_table[devid].data[0]  = pte_root;
1938 
1939 	/*
1940 	 * A kdump kernel might be replacing a domain ID that was copied from
1941 	 * the previous kernel--if so, it needs to flush the translation cache
1942 	 * entries for the old domain ID that is being overwritten
1943 	 */
1944 	if (old_domid) {
1945 		amd_iommu_flush_tlb_domid(iommu, old_domid);
1946 	}
1947 }
1948 
1949 static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1950 {
1951 	struct dev_table_entry *dev_table = get_dev_table(iommu);
1952 
1953 	/* remove entry from the device table seen by the hardware */
1954 	dev_table[devid].data[0]  = DTE_FLAG_V;
1955 
1956 	if (!amd_iommu_snp_en)
1957 		dev_table[devid].data[0] |= DTE_FLAG_TV;
1958 
1959 	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1960 
1961 	amd_iommu_apply_erratum_63(iommu, devid);
1962 }
1963 
1964 /* Update and flush DTE for the given device */
1965 void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set)
1966 {
1967 	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1968 
1969 	if (set)
1970 		set_dte_entry(iommu, dev_data);
1971 	else
1972 		clear_dte_entry(iommu, dev_data->devid);
1973 
1974 	clone_aliases(iommu, dev_data->dev);
1975 	device_flush_dte(dev_data);
1976 	iommu_completion_wait(iommu);
1977 }
1978 
1979 /*
1980  * If domain is SVA capable then initialize GCR3 table. Also if domain is
1981  * in v2 page table mode then update GCR3[0].
1982  */
1983 static int init_gcr3_table(struct iommu_dev_data *dev_data,
1984 			   struct protection_domain *pdom)
1985 {
1986 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1987 	int max_pasids = dev_data->max_pasids;
1988 	int ret = 0;
1989 
1990 	 /*
1991 	  * If domain is in pt mode then setup GCR3 table only if device
1992 	  * is PASID capable
1993 	  */
1994 	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
1995 		return ret;
1996 
1997 	/*
1998 	 * By default, setup GCR3 table to support MAX PASIDs
1999 	 * supported by the device/IOMMU.
2000 	 */
2001 	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
2002 			       max_pasids > 0 ?  max_pasids : 1);
2003 	if (ret)
2004 		return ret;
2005 
2006 	/* Setup GCR3[0] only if domain is setup with v2 page table mode */
2007 	if (!pdom_is_v2_pgtbl_mode(pdom))
2008 		return ret;
2009 
2010 	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
2011 	if (ret)
2012 		free_gcr3_table(&dev_data->gcr3_info);
2013 
2014 	return ret;
2015 }
2016 
2017 static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
2018 			       struct protection_domain *pdom)
2019 {
2020 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
2021 
2022 	if (pdom_is_v2_pgtbl_mode(pdom))
2023 		update_gcr3(dev_data, 0, 0, false);
2024 
2025 	if (gcr3_info->gcr3_tbl == NULL)
2026 		return;
2027 
2028 	free_gcr3_table(gcr3_info);
2029 }
2030 
2031 static int do_attach(struct iommu_dev_data *dev_data,
2032 		     struct protection_domain *domain)
2033 {
2034 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2035 	struct pci_dev *pdev;
2036 	int ret = 0;
2037 
2038 	/* Update data structures */
2039 	dev_data->domain = domain;
2040 	list_add(&dev_data->list, &domain->dev_list);
2041 
2042 	/* Update NUMA Node ID */
2043 	if (domain->nid == NUMA_NO_NODE)
2044 		domain->nid = dev_to_node(dev_data->dev);
2045 
2046 	/* Do reference counting */
2047 	domain->dev_iommu[iommu->index] += 1;
2048 	domain->dev_cnt                 += 1;
2049 
2050 	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
2051 	if (pdom_is_sva_capable(domain)) {
2052 		ret = init_gcr3_table(dev_data, domain);
2053 		if (ret)
2054 			return ret;
2055 
2056 		if (pdev) {
2057 			pdev_enable_caps(pdev);
2058 
2059 			/*
2060 			 * Device can continue to function even if IOPF
2061 			 * enablement failed. Hence in error path just
2062 			 * disable device PRI support.
2063 			 */
2064 			if (amd_iommu_iopf_add_device(iommu, dev_data))
2065 				pdev_disable_cap_pri(pdev);
2066 		}
2067 	} else if (pdev) {
2068 		pdev_enable_cap_ats(pdev);
2069 	}
2070 
2071 	/* Update device table */
2072 	amd_iommu_dev_update_dte(dev_data, true);
2073 
2074 	return ret;
2075 }
2076 
2077 static void do_detach(struct iommu_dev_data *dev_data)
2078 {
2079 	struct protection_domain *domain = dev_data->domain;
2080 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2081 
2082 	/* Clear GCR3 table */
2083 	if (pdom_is_sva_capable(domain))
2084 		destroy_gcr3_table(dev_data, domain);
2085 
2086 	/* Update data structures */
2087 	dev_data->domain = NULL;
2088 	list_del(&dev_data->list);
2089 
2090 	/* Clear DTE and flush the entry */
2091 	amd_iommu_dev_update_dte(dev_data, false);
2092 
2093 	/* Flush IOTLB and wait for the flushes to finish */
2094 	amd_iommu_domain_flush_all(domain);
2095 
2096 	/* decrease reference counters - needs to happen after the flushes */
2097 	domain->dev_iommu[iommu->index] -= 1;
2098 	domain->dev_cnt                 -= 1;
2099 }
2100 
2101 /*
2102  * If a device is not yet associated with a domain, this function makes the
2103  * device visible in the domain
2104  */
2105 static int attach_device(struct device *dev,
2106 			 struct protection_domain *domain)
2107 {
2108 	struct iommu_dev_data *dev_data;
2109 	unsigned long flags;
2110 	int ret = 0;
2111 
2112 	spin_lock_irqsave(&domain->lock, flags);
2113 
2114 	dev_data = dev_iommu_priv_get(dev);
2115 
2116 	spin_lock(&dev_data->lock);
2117 
2118 	if (dev_data->domain != NULL) {
2119 		ret = -EBUSY;
2120 		goto out;
2121 	}
2122 
2123 	ret = do_attach(dev_data, domain);
2124 
2125 out:
2126 	spin_unlock(&dev_data->lock);
2127 
2128 	spin_unlock_irqrestore(&domain->lock, flags);
2129 
2130 	return ret;
2131 }
2132 
2133 /*
2134  * Removes a device from a protection domain (with devtable_lock held)
2135  */
2136 static void detach_device(struct device *dev)
2137 {
2138 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2139 	struct protection_domain *domain = dev_data->domain;
2140 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2141 	unsigned long flags;
2142 	bool ppr = dev_data->ppr;
2143 
2144 	spin_lock_irqsave(&domain->lock, flags);
2145 
2146 	spin_lock(&dev_data->lock);
2147 
2148 	/*
2149 	 * First check if the device is still attached. It might already
2150 	 * be detached from its domain because the generic
2151 	 * iommu_detach_group code detached it and we try again here in
2152 	 * our alias handling.
2153 	 */
2154 	if (WARN_ON(!dev_data->domain))
2155 		goto out;
2156 
2157 	if (ppr) {
2158 		iopf_queue_flush_dev(dev);
2159 
2160 		/* Updated here so that it gets reflected in DTE */
2161 		dev_data->ppr = false;
2162 	}
2163 
2164 	do_detach(dev_data);
2165 
2166 	/* Remove IOPF handler */
2167 	if (ppr)
2168 		amd_iommu_iopf_remove_device(iommu, dev_data);
2169 
2170 	if (dev_is_pci(dev))
2171 		pdev_disable_caps(to_pci_dev(dev));
2172 
2173 out:
2174 	spin_unlock(&dev_data->lock);
2175 
2176 	spin_unlock_irqrestore(&domain->lock, flags);
2177 }
2178 
2179 static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2180 {
2181 	struct iommu_device *iommu_dev;
2182 	struct amd_iommu *iommu;
2183 	struct iommu_dev_data *dev_data;
2184 	int ret;
2185 
2186 	if (!check_device(dev))
2187 		return ERR_PTR(-ENODEV);
2188 
2189 	iommu = rlookup_amd_iommu(dev);
2190 	if (!iommu)
2191 		return ERR_PTR(-ENODEV);
2192 
2193 	/* Not registered yet? */
2194 	if (!iommu->iommu.ops)
2195 		return ERR_PTR(-ENODEV);
2196 
2197 	if (dev_iommu_priv_get(dev))
2198 		return &iommu->iommu;
2199 
2200 	ret = iommu_init_device(iommu, dev);
2201 	if (ret) {
2202 		dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2203 		iommu_dev = ERR_PTR(ret);
2204 		iommu_ignore_device(iommu, dev);
2205 	} else {
2206 		amd_iommu_set_pci_msi_domain(dev, iommu);
2207 		iommu_dev = &iommu->iommu;
2208 	}
2209 
2210 	/*
2211 	 * If IOMMU and device supports PASID then it will contain max
2212 	 * supported PASIDs, else it will be zero.
2213 	 */
2214 	dev_data = dev_iommu_priv_get(dev);
2215 	if (amd_iommu_pasid_supported() && dev_is_pci(dev) &&
2216 	    pdev_pasid_supported(dev_data)) {
2217 		dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
2218 					     pci_max_pasids(to_pci_dev(dev)));
2219 	}
2220 
2221 	iommu_completion_wait(iommu);
2222 
2223 	return iommu_dev;
2224 }
2225 
2226 static void amd_iommu_release_device(struct device *dev)
2227 {
2228 	struct amd_iommu *iommu;
2229 
2230 	if (!check_device(dev))
2231 		return;
2232 
2233 	iommu = rlookup_amd_iommu(dev);
2234 	if (!iommu)
2235 		return;
2236 
2237 	amd_iommu_uninit_device(dev);
2238 	iommu_completion_wait(iommu);
2239 }
2240 
/*
 * Pick the iommu_group for @dev: PCI devices use pci_device_group(),
 * everything else goes through acpihid_device_group().
 */
static struct iommu_group *amd_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev)
			       : acpihid_device_group(dev);
}
2248 
2249 /*****************************************************************************
2250  *
2251  * The following functions belong to the exported interface of AMD IOMMU
2252  *
2253  * This interface allows access to lower level functions of the IOMMU
 * like protection domain handling and assignment of devices to domains
2255  * which is not possible with the dma_ops interface.
2256  *
2257  *****************************************************************************/
2258 
2259 static void cleanup_domain(struct protection_domain *domain)
2260 {
2261 	struct iommu_dev_data *entry;
2262 
2263 	lockdep_assert_held(&domain->lock);
2264 
2265 	if (!domain->dev_cnt)
2266 		return;
2267 
2268 	while (!list_empty(&domain->dev_list)) {
2269 		entry = list_first_entry(&domain->dev_list,
2270 					 struct iommu_dev_data, list);
2271 		BUG_ON(!entry->domain);
2272 		do_detach(entry);
2273 	}
2274 	WARN_ON(domain->dev_cnt != 0);
2275 }
2276 
2277 void protection_domain_free(struct protection_domain *domain)
2278 {
2279 	if (!domain)
2280 		return;
2281 
2282 	if (domain->iop.pgtbl_cfg.tlb)
2283 		free_io_pgtable_ops(&domain->iop.iop.ops);
2284 
2285 	if (domain->iop.root)
2286 		iommu_free_page(domain->iop.root);
2287 
2288 	if (domain->id)
2289 		domain_id_free(domain->id);
2290 
2291 	kfree(domain);
2292 }
2293 
2294 static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2295 {
2296 	u64 *pt_root = NULL;
2297 
2298 	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2299 
2300 	if (mode != PAGE_MODE_NONE) {
2301 		pt_root = iommu_alloc_page(GFP_KERNEL);
2302 		if (!pt_root)
2303 			return -ENOMEM;
2304 	}
2305 
2306 	domain->pd_mode = PD_MODE_V1;
2307 	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2308 
2309 	return 0;
2310 }
2311 
2312 static int protection_domain_init_v2(struct protection_domain *pdom)
2313 {
2314 	pdom->pd_mode = PD_MODE_V2;
2315 	pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2316 
2317 	return 0;
2318 }
2319 
2320 struct protection_domain *protection_domain_alloc(unsigned int type)
2321 {
2322 	struct io_pgtable_ops *pgtbl_ops;
2323 	struct protection_domain *domain;
2324 	int pgtable;
2325 	int ret;
2326 
2327 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2328 	if (!domain)
2329 		return NULL;
2330 
2331 	domain->id = domain_id_alloc();
2332 	if (!domain->id)
2333 		goto out_err;
2334 
2335 	spin_lock_init(&domain->lock);
2336 	INIT_LIST_HEAD(&domain->dev_list);
2337 	INIT_LIST_HEAD(&domain->dev_data_list);
2338 	domain->nid = NUMA_NO_NODE;
2339 
2340 	switch (type) {
2341 	/* No need to allocate io pgtable ops in passthrough mode */
2342 	case IOMMU_DOMAIN_IDENTITY:
2343 	case IOMMU_DOMAIN_SVA:
2344 		return domain;
2345 	case IOMMU_DOMAIN_DMA:
2346 		pgtable = amd_iommu_pgtable;
2347 		break;
2348 	/*
2349 	 * Force IOMMU v1 page table when allocating
2350 	 * domain for pass-through devices.
2351 	 */
2352 	case IOMMU_DOMAIN_UNMANAGED:
2353 		pgtable = AMD_IOMMU_V1;
2354 		break;
2355 	default:
2356 		goto out_err;
2357 	}
2358 
2359 	switch (pgtable) {
2360 	case AMD_IOMMU_V1:
2361 		ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
2362 		break;
2363 	case AMD_IOMMU_V2:
2364 		ret = protection_domain_init_v2(domain);
2365 		break;
2366 	default:
2367 		ret = -EINVAL;
2368 		break;
2369 	}
2370 
2371 	if (ret)
2372 		goto out_err;
2373 
2374 	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2375 	if (!pgtbl_ops)
2376 		goto out_err;
2377 
2378 	return domain;
2379 out_err:
2380 	protection_domain_free(domain);
2381 	return NULL;
2382 }
2383 
2384 static inline u64 dma_max_address(void)
2385 {
2386 	if (amd_iommu_pgtable == AMD_IOMMU_V1)
2387 		return ~0ULL;
2388 
2389 	/* V2 with 4/5 level page table */
2390 	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2391 }
2392 
2393 static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2394 {
2395 	return iommu && (iommu->features & FEATURE_HDSUP);
2396 }
2397 
2398 static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2399 						  struct device *dev, u32 flags)
2400 {
2401 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2402 	struct protection_domain *domain;
2403 	struct amd_iommu *iommu = NULL;
2404 
2405 	if (dev)
2406 		iommu = get_amd_iommu_from_dev(dev);
2407 
2408 	/*
2409 	 * Since DTE[Mode]=0 is prohibited on SNP-enabled system,
2410 	 * default to use IOMMU_DOMAIN_DMA[_FQ].
2411 	 */
2412 	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2413 		return ERR_PTR(-EINVAL);
2414 
2415 	if (dirty_tracking && !amd_iommu_hd_support(iommu))
2416 		return ERR_PTR(-EOPNOTSUPP);
2417 
2418 	domain = protection_domain_alloc(type);
2419 	if (!domain)
2420 		return ERR_PTR(-ENOMEM);
2421 
2422 	domain->domain.geometry.aperture_start = 0;
2423 	domain->domain.geometry.aperture_end   = dma_max_address();
2424 	domain->domain.geometry.force_aperture = true;
2425 
2426 	if (iommu) {
2427 		domain->domain.type = type;
2428 		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2429 		domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2430 
2431 		if (dirty_tracking)
2432 			domain->domain.dirty_ops = &amd_dirty_ops;
2433 	}
2434 
2435 	return &domain->domain;
2436 }
2437 
2438 static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2439 {
2440 	struct iommu_domain *domain;
2441 
2442 	domain = do_iommu_domain_alloc(type, NULL, 0);
2443 	if (IS_ERR(domain))
2444 		return NULL;
2445 
2446 	return domain;
2447 }
2448 
2449 static struct iommu_domain *
2450 amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2451 			    struct iommu_domain *parent,
2452 			    const struct iommu_user_data *user_data)
2453 
2454 {
2455 	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2456 
2457 	if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2458 		return ERR_PTR(-EOPNOTSUPP);
2459 
2460 	return do_iommu_domain_alloc(type, dev, flags);
2461 }
2462 
2463 void amd_iommu_domain_free(struct iommu_domain *dom)
2464 {
2465 	struct protection_domain *domain;
2466 	unsigned long flags;
2467 
2468 	if (!dom)
2469 		return;
2470 
2471 	domain = to_pdomain(dom);
2472 
2473 	spin_lock_irqsave(&domain->lock, flags);
2474 
2475 	cleanup_domain(domain);
2476 
2477 	spin_unlock_irqrestore(&domain->lock, flags);
2478 
2479 	protection_domain_free(domain);
2480 }
2481 
2482 static int amd_iommu_attach_device(struct iommu_domain *dom,
2483 				   struct device *dev)
2484 {
2485 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2486 	struct protection_domain *domain = to_pdomain(dom);
2487 	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2488 	int ret;
2489 
2490 	/*
2491 	 * Skip attach device to domain if new domain is same as
2492 	 * devices current domain
2493 	 */
2494 	if (dev_data->domain == domain)
2495 		return 0;
2496 
2497 	dev_data->defer_attach = false;
2498 
2499 	/*
2500 	 * Restrict to devices with compatible IOMMU hardware support
2501 	 * when enforcement of dirty tracking is enabled.
2502 	 */
2503 	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2504 		return -EINVAL;
2505 
2506 	if (dev_data->domain)
2507 		detach_device(dev);
2508 
2509 	ret = attach_device(dev, domain);
2510 
2511 #ifdef CONFIG_IRQ_REMAP
2512 	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2513 		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2514 			dev_data->use_vapic = 1;
2515 		else
2516 			dev_data->use_vapic = 0;
2517 	}
2518 #endif
2519 
2520 	iommu_completion_wait(iommu);
2521 
2522 	return ret;
2523 }
2524 
2525 static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2526 				    unsigned long iova, size_t size)
2527 {
2528 	struct protection_domain *domain = to_pdomain(dom);
2529 	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2530 
2531 	if (ops->map_pages)
2532 		domain_flush_np_cache(domain, iova, size);
2533 	return 0;
2534 }
2535 
2536 static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2537 			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2538 			       int iommu_prot, gfp_t gfp, size_t *mapped)
2539 {
2540 	struct protection_domain *domain = to_pdomain(dom);
2541 	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2542 	int prot = 0;
2543 	int ret = -EINVAL;
2544 
2545 	if ((domain->pd_mode == PD_MODE_V1) &&
2546 	    (domain->iop.mode == PAGE_MODE_NONE))
2547 		return -EINVAL;
2548 
2549 	if (iommu_prot & IOMMU_READ)
2550 		prot |= IOMMU_PROT_IR;
2551 	if (iommu_prot & IOMMU_WRITE)
2552 		prot |= IOMMU_PROT_IW;
2553 
2554 	if (ops->map_pages) {
2555 		ret = ops->map_pages(ops, iova, paddr, pgsize,
2556 				     pgcount, prot, gfp, mapped);
2557 	}
2558 
2559 	return ret;
2560 }
2561 
2562 static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2563 					    struct iommu_iotlb_gather *gather,
2564 					    unsigned long iova, size_t size)
2565 {
2566 	/*
2567 	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2568 	 * Unless we run in a virtual machine, which can be inferred according
2569 	 * to whether "non-present cache" is on, it is probably best to prefer
2570 	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2571 	 * mutliple TLB flushes (i.e., more flushes). For virtual machines the
2572 	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2573 	 * the guest, and the trade-off is different: unnecessary TLB flushes
2574 	 * should be avoided.
2575 	 */
2576 	if (amd_iommu_np_cache &&
2577 	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2578 		iommu_iotlb_sync(domain, gather);
2579 
2580 	iommu_iotlb_gather_add_range(gather, iova, size);
2581 }
2582 
2583 static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2584 				    size_t pgsize, size_t pgcount,
2585 				    struct iommu_iotlb_gather *gather)
2586 {
2587 	struct protection_domain *domain = to_pdomain(dom);
2588 	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2589 	size_t r;
2590 
2591 	if ((domain->pd_mode == PD_MODE_V1) &&
2592 	    (domain->iop.mode == PAGE_MODE_NONE))
2593 		return 0;
2594 
2595 	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2596 
2597 	if (r)
2598 		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2599 
2600 	return r;
2601 }
2602 
2603 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2604 					  dma_addr_t iova)
2605 {
2606 	struct protection_domain *domain = to_pdomain(dom);
2607 	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2608 
2609 	return ops->iova_to_phys(ops, iova);
2610 }
2611 
2612 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2613 {
2614 	switch (cap) {
2615 	case IOMMU_CAP_CACHE_COHERENCY:
2616 		return true;
2617 	case IOMMU_CAP_NOEXEC:
2618 		return false;
2619 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2620 		return amdr_ivrs_remap_support;
2621 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2622 		return true;
2623 	case IOMMU_CAP_DEFERRED_FLUSH:
2624 		return true;
2625 	case IOMMU_CAP_DIRTY_TRACKING: {
2626 		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2627 
2628 		return amd_iommu_hd_support(iommu);
2629 	}
2630 	default:
2631 		break;
2632 	}
2633 
2634 	return false;
2635 }
2636 
2637 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2638 					bool enable)
2639 {
2640 	struct protection_domain *pdomain = to_pdomain(domain);
2641 	struct dev_table_entry *dev_table;
2642 	struct iommu_dev_data *dev_data;
2643 	bool domain_flush = false;
2644 	struct amd_iommu *iommu;
2645 	unsigned long flags;
2646 	u64 pte_root;
2647 
2648 	spin_lock_irqsave(&pdomain->lock, flags);
2649 	if (!(pdomain->dirty_tracking ^ enable)) {
2650 		spin_unlock_irqrestore(&pdomain->lock, flags);
2651 		return 0;
2652 	}
2653 
2654 	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2655 		iommu = get_amd_iommu_from_dev_data(dev_data);
2656 
2657 		dev_table = get_dev_table(iommu);
2658 		pte_root = dev_table[dev_data->devid].data[0];
2659 
2660 		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2661 				     pte_root & ~DTE_FLAG_HAD);
2662 
2663 		/* Flush device DTE */
2664 		dev_table[dev_data->devid].data[0] = pte_root;
2665 		device_flush_dte(dev_data);
2666 		domain_flush = true;
2667 	}
2668 
2669 	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2670 	if (domain_flush)
2671 		amd_iommu_domain_flush_all(pdomain);
2672 
2673 	pdomain->dirty_tracking = enable;
2674 	spin_unlock_irqrestore(&pdomain->lock, flags);
2675 
2676 	return 0;
2677 }
2678 
2679 static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2680 					  unsigned long iova, size_t size,
2681 					  unsigned long flags,
2682 					  struct iommu_dirty_bitmap *dirty)
2683 {
2684 	struct protection_domain *pdomain = to_pdomain(domain);
2685 	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2686 	unsigned long lflags;
2687 
2688 	if (!ops || !ops->read_and_clear_dirty)
2689 		return -EOPNOTSUPP;
2690 
2691 	spin_lock_irqsave(&pdomain->lock, lflags);
2692 	if (!pdomain->dirty_tracking && dirty->bitmap) {
2693 		spin_unlock_irqrestore(&pdomain->lock, lflags);
2694 		return -EINVAL;
2695 	}
2696 	spin_unlock_irqrestore(&pdomain->lock, lflags);
2697 
2698 	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2699 }
2700 
2701 static void amd_iommu_get_resv_regions(struct device *dev,
2702 				       struct list_head *head)
2703 {
2704 	struct iommu_resv_region *region;
2705 	struct unity_map_entry *entry;
2706 	struct amd_iommu *iommu;
2707 	struct amd_iommu_pci_seg *pci_seg;
2708 	int devid, sbdf;
2709 
2710 	sbdf = get_device_sbdf_id(dev);
2711 	if (sbdf < 0)
2712 		return;
2713 
2714 	devid = PCI_SBDF_TO_DEVID(sbdf);
2715 	iommu = get_amd_iommu_from_dev(dev);
2716 	pci_seg = iommu->pci_seg;
2717 
2718 	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2719 		int type, prot = 0;
2720 		size_t length;
2721 
2722 		if (devid < entry->devid_start || devid > entry->devid_end)
2723 			continue;
2724 
2725 		type   = IOMMU_RESV_DIRECT;
2726 		length = entry->address_end - entry->address_start;
2727 		if (entry->prot & IOMMU_PROT_IR)
2728 			prot |= IOMMU_READ;
2729 		if (entry->prot & IOMMU_PROT_IW)
2730 			prot |= IOMMU_WRITE;
2731 		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2732 			/* Exclusion range */
2733 			type = IOMMU_RESV_RESERVED;
2734 
2735 		region = iommu_alloc_resv_region(entry->address_start,
2736 						 length, prot, type,
2737 						 GFP_KERNEL);
2738 		if (!region) {
2739 			dev_err(dev, "Out of memory allocating dm-regions\n");
2740 			return;
2741 		}
2742 		list_add_tail(&region->list, head);
2743 	}
2744 
2745 	region = iommu_alloc_resv_region(MSI_RANGE_START,
2746 					 MSI_RANGE_END - MSI_RANGE_START + 1,
2747 					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2748 	if (!region)
2749 		return;
2750 	list_add_tail(&region->list, head);
2751 
2752 	region = iommu_alloc_resv_region(HT_RANGE_START,
2753 					 HT_RANGE_END - HT_RANGE_START + 1,
2754 					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2755 	if (!region)
2756 		return;
2757 	list_add_tail(&region->list, head);
2758 }
2759 
2760 bool amd_iommu_is_attach_deferred(struct device *dev)
2761 {
2762 	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2763 
2764 	return dev_data->defer_attach;
2765 }
2766 
2767 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2768 {
2769 	struct protection_domain *dom = to_pdomain(domain);
2770 	unsigned long flags;
2771 
2772 	spin_lock_irqsave(&dom->lock, flags);
2773 	amd_iommu_domain_flush_all(dom);
2774 	spin_unlock_irqrestore(&dom->lock, flags);
2775 }
2776 
2777 static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2778 				 struct iommu_iotlb_gather *gather)
2779 {
2780 	struct protection_domain *dom = to_pdomain(domain);
2781 	unsigned long flags;
2782 
2783 	spin_lock_irqsave(&dom->lock, flags);
2784 	amd_iommu_domain_flush_pages(dom, gather->start,
2785 				     gather->end - gather->start + 1);
2786 	spin_unlock_irqrestore(&dom->lock, flags);
2787 }
2788 
2789 static int amd_iommu_def_domain_type(struct device *dev)
2790 {
2791 	struct iommu_dev_data *dev_data;
2792 
2793 	dev_data = dev_iommu_priv_get(dev);
2794 	if (!dev_data)
2795 		return 0;
2796 
2797 	/* Always use DMA domain for untrusted device */
2798 	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
2799 		return IOMMU_DOMAIN_DMA;
2800 
2801 	/*
2802 	 * Do not identity map IOMMUv2 capable devices when:
2803 	 *  - memory encryption is active, because some of those devices
2804 	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2805 	 *    and require remapping.
2806 	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2807 	 */
2808 	if (pdev_pasid_supported(dev_data) &&
2809 	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2810 	    !amd_iommu_snp_en) {
2811 		return IOMMU_DOMAIN_IDENTITY;
2812 	}
2813 
2814 	return 0;
2815 }
2816 
2817 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2818 {
2819 	/* IOMMU_PTE_FC is always set */
2820 	return true;
2821 }
2822 
2823 static const struct iommu_dirty_ops amd_dirty_ops = {
2824 	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2825 	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2826 };
2827 
2828 static int amd_iommu_dev_enable_feature(struct device *dev,
2829 					enum iommu_dev_features feat)
2830 {
2831 	int ret = 0;
2832 
2833 	switch (feat) {
2834 	case IOMMU_DEV_FEAT_IOPF:
2835 	case IOMMU_DEV_FEAT_SVA:
2836 		break;
2837 	default:
2838 		ret = -EINVAL;
2839 		break;
2840 	}
2841 	return ret;
2842 }
2843 
2844 static int amd_iommu_dev_disable_feature(struct device *dev,
2845 					 enum iommu_dev_features feat)
2846 {
2847 	int ret = 0;
2848 
2849 	switch (feat) {
2850 	case IOMMU_DEV_FEAT_IOPF:
2851 	case IOMMU_DEV_FEAT_SVA:
2852 		break;
2853 	default:
2854 		ret = -EINVAL;
2855 		break;
2856 	}
2857 	return ret;
2858 }
2859 
2860 const struct iommu_ops amd_iommu_ops = {
2861 	.capable = amd_iommu_capable,
2862 	.domain_alloc = amd_iommu_domain_alloc,
2863 	.domain_alloc_user = amd_iommu_domain_alloc_user,
2864 	.domain_alloc_sva = amd_iommu_domain_alloc_sva,
2865 	.probe_device = amd_iommu_probe_device,
2866 	.release_device = amd_iommu_release_device,
2867 	.device_group = amd_iommu_device_group,
2868 	.get_resv_regions = amd_iommu_get_resv_regions,
2869 	.is_attach_deferred = amd_iommu_is_attach_deferred,
2870 	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
2871 	.def_domain_type = amd_iommu_def_domain_type,
2872 	.dev_enable_feat = amd_iommu_dev_enable_feature,
2873 	.dev_disable_feat = amd_iommu_dev_disable_feature,
2874 	.remove_dev_pasid = amd_iommu_remove_dev_pasid,
2875 	.page_response = amd_iommu_page_response,
2876 	.default_domain_ops = &(const struct iommu_domain_ops) {
2877 		.attach_dev	= amd_iommu_attach_device,
2878 		.map_pages	= amd_iommu_map_pages,
2879 		.unmap_pages	= amd_iommu_unmap_pages,
2880 		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2881 		.iova_to_phys	= amd_iommu_iova_to_phys,
2882 		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2883 		.iotlb_sync	= amd_iommu_iotlb_sync,
2884 		.free		= amd_iommu_domain_free,
2885 		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2886 	}
2887 };
2888 
2889 #ifdef CONFIG_IRQ_REMAP
2890 
2891 /*****************************************************************************
2892  *
2893  * Interrupt Remapping Implementation
2894  *
2895  *****************************************************************************/
2896 
2897 static struct irq_chip amd_ir_chip;
2898 static DEFINE_SPINLOCK(iommu_table_lock);
2899 
2900 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2901 {
2902 	int ret;
2903 	u64 data;
2904 	unsigned long flags;
2905 	struct iommu_cmd cmd, cmd2;
2906 
2907 	if (iommu->irtcachedis_enabled)
2908 		return;
2909 
2910 	build_inv_irt(&cmd, devid);
2911 	data = atomic64_add_return(1, &iommu->cmd_sem_val);
2912 	build_completion_wait(&cmd2, iommu, data);
2913 
2914 	raw_spin_lock_irqsave(&iommu->lock, flags);
2915 	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2916 	if (ret)
2917 		goto out;
2918 	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2919 	if (ret)
2920 		goto out;
2921 	wait_on_sem(iommu, data);
2922 out:
2923 	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2924 }
2925 
2926 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2927 			      struct irq_remap_table *table)
2928 {
2929 	u64 dte;
2930 	struct dev_table_entry *dev_table = get_dev_table(iommu);
2931 
2932 	dte	= dev_table[devid].data[2];
2933 	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2934 	dte	|= iommu_virt_to_phys(table->table);
2935 	dte	|= DTE_IRQ_REMAP_INTCTL;
2936 	dte	|= DTE_INTTABLEN;
2937 	dte	|= DTE_IRQ_REMAP_ENABLE;
2938 
2939 	dev_table[devid].data[2] = dte;
2940 }
2941 
2942 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2943 {
2944 	struct irq_remap_table *table;
2945 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2946 
2947 	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2948 		      "%s: no iommu for devid %x:%x\n",
2949 		      __func__, pci_seg->id, devid))
2950 		return NULL;
2951 
2952 	table = pci_seg->irq_lookup_table[devid];
2953 	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2954 		      __func__, pci_seg->id, devid))
2955 		return NULL;
2956 
2957 	return table;
2958 }
2959 
2960 static struct irq_remap_table *__alloc_irq_table(void)
2961 {
2962 	struct irq_remap_table *table;
2963 
2964 	table = kzalloc(sizeof(*table), GFP_KERNEL);
2965 	if (!table)
2966 		return NULL;
2967 
2968 	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2969 	if (!table->table) {
2970 		kfree(table);
2971 		return NULL;
2972 	}
2973 	raw_spin_lock_init(&table->lock);
2974 
2975 	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2976 		memset(table->table, 0,
2977 		       MAX_IRQS_PER_TABLE * sizeof(u32));
2978 	else
2979 		memset(table->table, 0,
2980 		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2981 	return table;
2982 }
2983 
2984 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2985 				  struct irq_remap_table *table)
2986 {
2987 	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2988 
2989 	pci_seg->irq_lookup_table[devid] = table;
2990 	set_dte_irq_entry(iommu, devid, table);
2991 	iommu_flush_dte(iommu, devid);
2992 }
2993 
2994 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2995 				       void *data)
2996 {
2997 	struct irq_remap_table *table = data;
2998 	struct amd_iommu_pci_seg *pci_seg;
2999 	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
3000 
3001 	if (!iommu)
3002 		return -EINVAL;
3003 
3004 	pci_seg = iommu->pci_seg;
3005 	pci_seg->irq_lookup_table[alias] = table;
3006 	set_dte_irq_entry(iommu, alias, table);
3007 	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
3008 
3009 	return 0;
3010 }
3011 
3012 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
3013 					       u16 devid, struct pci_dev *pdev)
3014 {
3015 	struct irq_remap_table *table = NULL;
3016 	struct irq_remap_table *new_table = NULL;
3017 	struct amd_iommu_pci_seg *pci_seg;
3018 	unsigned long flags;
3019 	u16 alias;
3020 
3021 	spin_lock_irqsave(&iommu_table_lock, flags);
3022 
3023 	pci_seg = iommu->pci_seg;
3024 	table = pci_seg->irq_lookup_table[devid];
3025 	if (table)
3026 		goto out_unlock;
3027 
3028 	alias = pci_seg->alias_table[devid];
3029 	table = pci_seg->irq_lookup_table[alias];
3030 	if (table) {
3031 		set_remap_table_entry(iommu, devid, table);
3032 		goto out_wait;
3033 	}
3034 	spin_unlock_irqrestore(&iommu_table_lock, flags);
3035 
3036 	/* Nothing there yet, allocate new irq remapping table */
3037 	new_table = __alloc_irq_table();
3038 	if (!new_table)
3039 		return NULL;
3040 
3041 	spin_lock_irqsave(&iommu_table_lock, flags);
3042 
3043 	table = pci_seg->irq_lookup_table[devid];
3044 	if (table)
3045 		goto out_unlock;
3046 
3047 	table = pci_seg->irq_lookup_table[alias];
3048 	if (table) {
3049 		set_remap_table_entry(iommu, devid, table);
3050 		goto out_wait;
3051 	}
3052 
3053 	table = new_table;
3054 	new_table = NULL;
3055 
3056 	if (pdev)
3057 		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3058 				       table);
3059 	else
3060 		set_remap_table_entry(iommu, devid, table);
3061 
3062 	if (devid != alias)
3063 		set_remap_table_entry(iommu, alias, table);
3064 
3065 out_wait:
3066 	iommu_completion_wait(iommu);
3067 
3068 out_unlock:
3069 	spin_unlock_irqrestore(&iommu_table_lock, flags);
3070 
3071 	if (new_table) {
3072 		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3073 		kfree(new_table);
3074 	}
3075 	return table;
3076 }
3077 
3078 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3079 			   bool align, struct pci_dev *pdev)
3080 {
3081 	struct irq_remap_table *table;
3082 	int index, c, alignment = 1;
3083 	unsigned long flags;
3084 
3085 	table = alloc_irq_table(iommu, devid, pdev);
3086 	if (!table)
3087 		return -ENODEV;
3088 
3089 	if (align)
3090 		alignment = roundup_pow_of_two(count);
3091 
3092 	raw_spin_lock_irqsave(&table->lock, flags);
3093 
3094 	/* Scan table for free entries */
3095 	for (index = ALIGN(table->min_index, alignment), c = 0;
3096 	     index < MAX_IRQS_PER_TABLE;) {
3097 		if (!iommu->irte_ops->is_allocated(table, index)) {
3098 			c += 1;
3099 		} else {
3100 			c     = 0;
3101 			index = ALIGN(index + 1, alignment);
3102 			continue;
3103 		}
3104 
3105 		if (c == count)	{
3106 			for (; c != 0; --c)
3107 				iommu->irte_ops->set_allocated(table, index - c + 1);
3108 
3109 			index -= count - 1;
3110 			goto out;
3111 		}
3112 
3113 		index++;
3114 	}
3115 
3116 	index = -ENOSPC;
3117 
3118 out:
3119 	raw_spin_unlock_irqrestore(&table->lock, flags);
3120 
3121 	return index;
3122 }
3123 
3124 static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3125 			    struct irte_ga *irte)
3126 {
3127 	struct irq_remap_table *table;
3128 	struct irte_ga *entry;
3129 	unsigned long flags;
3130 	u128 old;
3131 
3132 	table = get_irq_table(iommu, devid);
3133 	if (!table)
3134 		return -ENOMEM;
3135 
3136 	raw_spin_lock_irqsave(&table->lock, flags);
3137 
3138 	entry = (struct irte_ga *)table->table;
3139 	entry = &entry[index];
3140 
3141 	/*
3142 	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
3143 	 * and it cannot be updated by the hardware or other processors
3144 	 * behind us, so the return value of cmpxchg16 should be the
3145 	 * same as the old value.
3146 	 */
3147 	old = entry->irte;
3148 	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3149 
3150 	raw_spin_unlock_irqrestore(&table->lock, flags);
3151 
3152 	return 0;
3153 }
3154 
3155 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3156 			  struct irte_ga *irte)
3157 {
3158 	bool ret;
3159 
3160 	ret = __modify_irte_ga(iommu, devid, index, irte);
3161 	if (ret)
3162 		return ret;
3163 
3164 	iommu_flush_irt_and_complete(iommu, devid);
3165 
3166 	return 0;
3167 }
3168 
3169 static int modify_irte(struct amd_iommu *iommu,
3170 		       u16 devid, int index, union irte *irte)
3171 {
3172 	struct irq_remap_table *table;
3173 	unsigned long flags;
3174 
3175 	table = get_irq_table(iommu, devid);
3176 	if (!table)
3177 		return -ENOMEM;
3178 
3179 	raw_spin_lock_irqsave(&table->lock, flags);
3180 	table->table[index] = irte->val;
3181 	raw_spin_unlock_irqrestore(&table->lock, flags);
3182 
3183 	iommu_flush_irt_and_complete(iommu, devid);
3184 
3185 	return 0;
3186 }
3187 
3188 static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3189 {
3190 	struct irq_remap_table *table;
3191 	unsigned long flags;
3192 
3193 	table = get_irq_table(iommu, devid);
3194 	if (!table)
3195 		return;
3196 
3197 	raw_spin_lock_irqsave(&table->lock, flags);
3198 	iommu->irte_ops->clear_allocated(table, index);
3199 	raw_spin_unlock_irqrestore(&table->lock, flags);
3200 
3201 	iommu_flush_irt_and_complete(iommu, devid);
3202 }
3203 
3204 static void irte_prepare(void *entry,
3205 			 u32 delivery_mode, bool dest_mode,
3206 			 u8 vector, u32 dest_apicid, int devid)
3207 {
3208 	union irte *irte = (union irte *) entry;
3209 
3210 	irte->val                = 0;
3211 	irte->fields.vector      = vector;
3212 	irte->fields.int_type    = delivery_mode;
3213 	irte->fields.destination = dest_apicid;
3214 	irte->fields.dm          = dest_mode;
3215 	irte->fields.valid       = 1;
3216 }
3217 
3218 static void irte_ga_prepare(void *entry,
3219 			    u32 delivery_mode, bool dest_mode,
3220 			    u8 vector, u32 dest_apicid, int devid)
3221 {
3222 	struct irte_ga *irte = (struct irte_ga *) entry;
3223 
3224 	irte->lo.val                      = 0;
3225 	irte->hi.val                      = 0;
3226 	irte->lo.fields_remap.int_type    = delivery_mode;
3227 	irte->lo.fields_remap.dm          = dest_mode;
3228 	irte->hi.fields.vector            = vector;
3229 	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3230 	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3231 	irte->lo.fields_remap.valid       = 1;
3232 }
3233 
3234 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3235 {
3236 	union irte *irte = (union irte *) entry;
3237 
3238 	irte->fields.valid = 1;
3239 	modify_irte(iommu, devid, index, irte);
3240 }
3241 
3242 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3243 {
3244 	struct irte_ga *irte = (struct irte_ga *) entry;
3245 
3246 	irte->lo.fields_remap.valid = 1;
3247 	modify_irte_ga(iommu, devid, index, irte);
3248 }
3249 
3250 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3251 {
3252 	union irte *irte = (union irte *) entry;
3253 
3254 	irte->fields.valid = 0;
3255 	modify_irte(iommu, devid, index, irte);
3256 }
3257 
3258 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3259 {
3260 	struct irte_ga *irte = (struct irte_ga *) entry;
3261 
3262 	irte->lo.fields_remap.valid = 0;
3263 	modify_irte_ga(iommu, devid, index, irte);
3264 }
3265 
3266 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3267 			      u8 vector, u32 dest_apicid)
3268 {
3269 	union irte *irte = (union irte *) entry;
3270 
3271 	irte->fields.vector = vector;
3272 	irte->fields.destination = dest_apicid;
3273 	modify_irte(iommu, devid, index, irte);
3274 }
3275 
3276 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3277 				 u8 vector, u32 dest_apicid)
3278 {
3279 	struct irte_ga *irte = (struct irte_ga *) entry;
3280 
3281 	if (!irte->lo.fields_remap.guest_mode) {
3282 		irte->hi.fields.vector = vector;
3283 		irte->lo.fields_remap.destination =
3284 					APICID_TO_IRTE_DEST_LO(dest_apicid);
3285 		irte->hi.fields.destination =
3286 					APICID_TO_IRTE_DEST_HI(dest_apicid);
3287 		modify_irte_ga(iommu, devid, index, irte);
3288 	}
3289 }
3290 
3291 #define IRTE_ALLOCATED (~1U)
3292 static void irte_set_allocated(struct irq_remap_table *table, int index)
3293 {
3294 	table->table[index] = IRTE_ALLOCATED;
3295 }
3296 
3297 static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3298 {
3299 	struct irte_ga *ptr = (struct irte_ga *)table->table;
3300 	struct irte_ga *irte = &ptr[index];
3301 
3302 	memset(&irte->lo.val, 0, sizeof(u64));
3303 	memset(&irte->hi.val, 0, sizeof(u64));
3304 	irte->hi.fields.vector = 0xff;
3305 }
3306 
3307 static bool irte_is_allocated(struct irq_remap_table *table, int index)
3308 {
3309 	union irte *ptr = (union irte *)table->table;
3310 	union irte *irte = &ptr[index];
3311 
3312 	return irte->val != 0;
3313 }
3314 
3315 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3316 {
3317 	struct irte_ga *ptr = (struct irte_ga *)table->table;
3318 	struct irte_ga *irte = &ptr[index];
3319 
3320 	return irte->hi.fields.vector != 0;
3321 }
3322 
3323 static void irte_clear_allocated(struct irq_remap_table *table, int index)
3324 {
3325 	table->table[index] = 0;
3326 }
3327 
3328 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3329 {
3330 	struct irte_ga *ptr = (struct irte_ga *)table->table;
3331 	struct irte_ga *irte = &ptr[index];
3332 
3333 	memset(&irte->lo.val, 0, sizeof(u64));
3334 	memset(&irte->hi.val, 0, sizeof(u64));
3335 }
3336 
3337 static int get_devid(struct irq_alloc_info *info)
3338 {
3339 	switch (info->type) {
3340 	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3341 		return get_ioapic_devid(info->devid);
3342 	case X86_IRQ_ALLOC_TYPE_HPET:
3343 		return get_hpet_devid(info->devid);
3344 	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3345 	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3346 		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3347 	default:
3348 		WARN_ON_ONCE(1);
3349 		return -1;
3350 	}
3351 }
3352 
3353 struct irq_remap_ops amd_iommu_irq_ops = {
3354 	.prepare		= amd_iommu_prepare,
3355 	.enable			= amd_iommu_enable,
3356 	.disable		= amd_iommu_disable,
3357 	.reenable		= amd_iommu_reenable,
3358 	.enable_faulting	= amd_iommu_enable_faulting,
3359 };
3360 
3361 static void fill_msi_msg(struct msi_msg *msg, u32 index)
3362 {
3363 	msg->data = index;
3364 	msg->address_lo = 0;
3365 	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3366 	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3367 }
3368 
3369 static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3370 				       struct irq_cfg *irq_cfg,
3371 				       struct irq_alloc_info *info,
3372 				       int devid, int index, int sub_handle)
3373 {
3374 	struct irq_2_irte *irte_info = &data->irq_2_irte;
3375 	struct amd_iommu *iommu = data->iommu;
3376 
3377 	if (!iommu)
3378 		return;
3379 
3380 	data->irq_2_irte.devid = devid;
3381 	data->irq_2_irte.index = index + sub_handle;
3382 	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3383 				 apic->dest_mode_logical, irq_cfg->vector,
3384 				 irq_cfg->dest_apicid, devid);
3385 
3386 	switch (info->type) {
3387 	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3388 	case X86_IRQ_ALLOC_TYPE_HPET:
3389 	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3390 	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3391 		fill_msi_msg(&data->msi_entry, irte_info->index);
3392 		break;
3393 
3394 	default:
3395 		BUG_ON(1);
3396 		break;
3397 	}
3398 }
3399 
3400 struct amd_irte_ops irte_32_ops = {
3401 	.prepare = irte_prepare,
3402 	.activate = irte_activate,
3403 	.deactivate = irte_deactivate,
3404 	.set_affinity = irte_set_affinity,
3405 	.set_allocated = irte_set_allocated,
3406 	.is_allocated = irte_is_allocated,
3407 	.clear_allocated = irte_clear_allocated,
3408 };
3409 
3410 struct amd_irte_ops irte_128_ops = {
3411 	.prepare = irte_ga_prepare,
3412 	.activate = irte_ga_activate,
3413 	.deactivate = irte_ga_deactivate,
3414 	.set_affinity = irte_ga_set_affinity,
3415 	.set_allocated = irte_ga_set_allocated,
3416 	.is_allocated = irte_ga_is_allocated,
3417 	.clear_allocated = irte_ga_clear_allocated,
3418 };
3419 
3420 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3421 			       unsigned int nr_irqs, void *arg)
3422 {
3423 	struct irq_alloc_info *info = arg;
3424 	struct irq_data *irq_data;
3425 	struct amd_ir_data *data = NULL;
3426 	struct amd_iommu *iommu;
3427 	struct irq_cfg *cfg;
3428 	int i, ret, devid, seg, sbdf;
3429 	int index;
3430 
3431 	if (!info)
3432 		return -EINVAL;
3433 	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3434 		return -EINVAL;
3435 
3436 	sbdf = get_devid(info);
3437 	if (sbdf < 0)
3438 		return -EINVAL;
3439 
3440 	seg = PCI_SBDF_TO_SEGID(sbdf);
3441 	devid = PCI_SBDF_TO_DEVID(sbdf);
3442 	iommu = __rlookup_amd_iommu(seg, devid);
3443 	if (!iommu)
3444 		return -EINVAL;
3445 
3446 	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3447 	if (ret < 0)
3448 		return ret;
3449 
3450 	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3451 		struct irq_remap_table *table;
3452 
3453 		table = alloc_irq_table(iommu, devid, NULL);
3454 		if (table) {
3455 			if (!table->min_index) {
3456 				/*
3457 				 * Keep the first 32 indexes free for IOAPIC
3458 				 * interrupts.
3459 				 */
3460 				table->min_index = 32;
3461 				for (i = 0; i < 32; ++i)
3462 					iommu->irte_ops->set_allocated(table, i);
3463 			}
3464 			WARN_ON(table->min_index != 32);
3465 			index = info->ioapic.pin;
3466 		} else {
3467 			index = -ENOMEM;
3468 		}
3469 	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3470 		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3471 		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3472 
3473 		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3474 					msi_desc_to_pci_dev(info->desc));
3475 	} else {
3476 		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3477 	}
3478 
3479 	if (index < 0) {
3480 		pr_warn("Failed to allocate IRTE\n");
3481 		ret = index;
3482 		goto out_free_parent;
3483 	}
3484 
3485 	for (i = 0; i < nr_irqs; i++) {
3486 		irq_data = irq_domain_get_irq_data(domain, virq + i);
3487 		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3488 		if (!cfg) {
3489 			ret = -EINVAL;
3490 			goto out_free_data;
3491 		}
3492 
3493 		ret = -ENOMEM;
3494 		data = kzalloc(sizeof(*data), GFP_KERNEL);
3495 		if (!data)
3496 			goto out_free_data;
3497 
3498 		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3499 			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3500 		else
3501 			data->entry = kzalloc(sizeof(struct irte_ga),
3502 						     GFP_KERNEL);
3503 		if (!data->entry) {
3504 			kfree(data);
3505 			goto out_free_data;
3506 		}
3507 
3508 		data->iommu = iommu;
3509 		irq_data->hwirq = (devid << 16) + i;
3510 		irq_data->chip_data = data;
3511 		irq_data->chip = &amd_ir_chip;
3512 		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3513 		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3514 	}
3515 
3516 	return 0;
3517 
3518 out_free_data:
3519 	for (i--; i >= 0; i--) {
3520 		irq_data = irq_domain_get_irq_data(domain, virq + i);
3521 		if (irq_data)
3522 			kfree(irq_data->chip_data);
3523 	}
3524 	for (i = 0; i < nr_irqs; i++)
3525 		free_irte(iommu, devid, index + i);
3526 out_free_parent:
3527 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3528 	return ret;
3529 }
3530 
3531 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3532 			       unsigned int nr_irqs)
3533 {
3534 	struct irq_2_irte *irte_info;
3535 	struct irq_data *irq_data;
3536 	struct amd_ir_data *data;
3537 	int i;
3538 
3539 	for (i = 0; i < nr_irqs; i++) {
3540 		irq_data = irq_domain_get_irq_data(domain, virq  + i);
3541 		if (irq_data && irq_data->chip_data) {
3542 			data = irq_data->chip_data;
3543 			irte_info = &data->irq_2_irte;
3544 			free_irte(data->iommu, irte_info->devid, irte_info->index);
3545 			kfree(data->entry);
3546 			kfree(data);
3547 		}
3548 	}
3549 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3550 }
3551 
/* Forward declaration: defined further down, used by irq_remapping_activate(). */
static void amd_ir_update_irte(struct irq_data *irqd,
			       struct amd_iommu *iommu,
			       struct amd_ir_data *ir_data,
			       struct irq_2_irte *irte_info,
			       struct irq_cfg *cfg);
3556 
3557 static int irq_remapping_activate(struct irq_domain *domain,
3558 				  struct irq_data *irq_data, bool reserve)
3559 {
3560 	struct amd_ir_data *data = irq_data->chip_data;
3561 	struct irq_2_irte *irte_info = &data->irq_2_irte;
3562 	struct amd_iommu *iommu = data->iommu;
3563 	struct irq_cfg *cfg = irqd_cfg(irq_data);
3564 
3565 	if (!iommu)
3566 		return 0;
3567 
3568 	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3569 				  irte_info->index);
3570 	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3571 	return 0;
3572 }
3573 
3574 static void irq_remapping_deactivate(struct irq_domain *domain,
3575 				     struct irq_data *irq_data)
3576 {
3577 	struct amd_ir_data *data = irq_data->chip_data;
3578 	struct irq_2_irte *irte_info = &data->irq_2_irte;
3579 	struct amd_iommu *iommu = data->iommu;
3580 
3581 	if (iommu)
3582 		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3583 					    irte_info->index);
3584 }
3585 
3586 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3587 				enum irq_domain_bus_token bus_token)
3588 {
3589 	struct amd_iommu *iommu;
3590 	int devid = -1;
3591 
3592 	if (!amd_iommu_irq_remap)
3593 		return 0;
3594 
3595 	if (x86_fwspec_is_ioapic(fwspec))
3596 		devid = get_ioapic_devid(fwspec->param[0]);
3597 	else if (x86_fwspec_is_hpet(fwspec))
3598 		devid = get_hpet_devid(fwspec->param[0]);
3599 
3600 	if (devid < 0)
3601 		return 0;
3602 	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3603 
3604 	return iommu && iommu->ir_domain == d;
3605 }
3606 
3607 static const struct irq_domain_ops amd_ir_domain_ops = {
3608 	.select = irq_remapping_select,
3609 	.alloc = irq_remapping_alloc,
3610 	.free = irq_remapping_free,
3611 	.activate = irq_remapping_activate,
3612 	.deactivate = irq_remapping_deactivate,
3613 };
3614 
3615 int amd_iommu_activate_guest_mode(void *data)
3616 {
3617 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3618 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3619 	u64 valid;
3620 
3621 	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3622 		return 0;
3623 
3624 	valid = entry->lo.fields_vapic.valid;
3625 
3626 	entry->lo.val = 0;
3627 	entry->hi.val = 0;
3628 
3629 	entry->lo.fields_vapic.valid       = valid;
3630 	entry->lo.fields_vapic.guest_mode  = 1;
3631 	entry->lo.fields_vapic.ga_log_intr = 1;
3632 	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3633 	entry->hi.fields.vector            = ir_data->ga_vector;
3634 	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3635 
3636 	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3637 			      ir_data->irq_2_irte.index, entry);
3638 }
3639 EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3640 
3641 int amd_iommu_deactivate_guest_mode(void *data)
3642 {
3643 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3644 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3645 	struct irq_cfg *cfg = ir_data->cfg;
3646 	u64 valid;
3647 
3648 	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3649 	    !entry || !entry->lo.fields_vapic.guest_mode)
3650 		return 0;
3651 
3652 	valid = entry->lo.fields_remap.valid;
3653 
3654 	entry->lo.val = 0;
3655 	entry->hi.val = 0;
3656 
3657 	entry->lo.fields_remap.valid       = valid;
3658 	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3659 	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3660 	entry->hi.fields.vector            = cfg->vector;
3661 	entry->lo.fields_remap.destination =
3662 				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3663 	entry->hi.fields.destination =
3664 				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3665 
3666 	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3667 			      ir_data->irq_2_irte.index, entry);
3668 }
3669 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3670 
3671 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3672 {
3673 	int ret;
3674 	struct amd_iommu_pi_data *pi_data = vcpu_info;
3675 	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3676 	struct amd_ir_data *ir_data = data->chip_data;
3677 	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3678 	struct iommu_dev_data *dev_data;
3679 
3680 	if (ir_data->iommu == NULL)
3681 		return -EINVAL;
3682 
3683 	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3684 
3685 	/* Note:
3686 	 * This device has never been set up for guest mode.
3687 	 * we should not modify the IRTE
3688 	 */
3689 	if (!dev_data || !dev_data->use_vapic)
3690 		return 0;
3691 
3692 	ir_data->cfg = irqd_cfg(data);
3693 	pi_data->ir_data = ir_data;
3694 
3695 	/* Note:
3696 	 * SVM tries to set up for VAPIC mode, but we are in
3697 	 * legacy mode. So, we force legacy mode instead.
3698 	 */
3699 	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3700 		pr_debug("%s: Fall back to using intr legacy remap\n",
3701 			 __func__);
3702 		pi_data->is_guest_mode = false;
3703 	}
3704 
3705 	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3706 	if (pi_data->is_guest_mode) {
3707 		ir_data->ga_root_ptr = (pi_data->base >> 12);
3708 		ir_data->ga_vector = vcpu_pi_info->vector;
3709 		ir_data->ga_tag = pi_data->ga_tag;
3710 		ret = amd_iommu_activate_guest_mode(ir_data);
3711 		if (!ret)
3712 			ir_data->cached_ga_tag = pi_data->ga_tag;
3713 	} else {
3714 		ret = amd_iommu_deactivate_guest_mode(ir_data);
3715 
3716 		/*
3717 		 * This communicates the ga_tag back to the caller
3718 		 * so that it can do all the necessary clean up.
3719 		 */
3720 		if (!ret)
3721 			ir_data->cached_ga_tag = 0;
3722 	}
3723 
3724 	return ret;
3725 }
3726 
3727 
3728 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3729 			       struct amd_ir_data *ir_data,
3730 			       struct irq_2_irte *irte_info,
3731 			       struct irq_cfg *cfg)
3732 {
3733 
3734 	/*
3735 	 * Atomically updates the IRTE with the new destination, vector
3736 	 * and flushes the interrupt entry cache.
3737 	 */
3738 	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3739 				      irte_info->index, cfg->vector,
3740 				      cfg->dest_apicid);
3741 }
3742 
3743 static int amd_ir_set_affinity(struct irq_data *data,
3744 			       const struct cpumask *mask, bool force)
3745 {
3746 	struct amd_ir_data *ir_data = data->chip_data;
3747 	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3748 	struct irq_cfg *cfg = irqd_cfg(data);
3749 	struct irq_data *parent = data->parent_data;
3750 	struct amd_iommu *iommu = ir_data->iommu;
3751 	int ret;
3752 
3753 	if (!iommu)
3754 		return -ENODEV;
3755 
3756 	ret = parent->chip->irq_set_affinity(parent, mask, force);
3757 	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3758 		return ret;
3759 
3760 	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3761 	/*
3762 	 * After this point, all the interrupts will start arriving
3763 	 * at the new destination. So, time to cleanup the previous
3764 	 * vector allocation.
3765 	 */
3766 	vector_schedule_cleanup(cfg);
3767 
3768 	return IRQ_SET_MASK_OK_DONE;
3769 }
3770 
3771 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3772 {
3773 	struct amd_ir_data *ir_data = irq_data->chip_data;
3774 
3775 	*msg = ir_data->msi_entry;
3776 }
3777 
3778 static struct irq_chip amd_ir_chip = {
3779 	.name			= "AMD-IR",
3780 	.irq_ack		= apic_ack_irq,
3781 	.irq_set_affinity	= amd_ir_set_affinity,
3782 	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3783 	.irq_compose_msi_msg	= ir_compose_msi_msg,
3784 };
3785 
3786 static const struct msi_parent_ops amdvi_msi_parent_ops = {
3787 	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
3788 	.prefix			= "IR-",
3789 	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3790 };
3791 
3792 int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3793 {
3794 	struct fwnode_handle *fn;
3795 
3796 	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3797 	if (!fn)
3798 		return -ENOMEM;
3799 	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3800 						       fn, &amd_ir_domain_ops, iommu);
3801 	if (!iommu->ir_domain) {
3802 		irq_domain_free_fwnode(fn);
3803 		return -ENOMEM;
3804 	}
3805 
3806 	irq_domain_update_bus_token(iommu->ir_domain,  DOMAIN_BUS_AMDVI);
3807 	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3808 				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3809 	iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3810 
3811 	return 0;
3812 }
3813 
3814 int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3815 {
3816 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3817 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3818 
3819 	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3820 	    !entry || !entry->lo.fields_vapic.guest_mode)
3821 		return 0;
3822 
3823 	if (!ir_data->iommu)
3824 		return -ENODEV;
3825 
3826 	if (cpu >= 0) {
3827 		entry->lo.fields_vapic.destination =
3828 					APICID_TO_IRTE_DEST_LO(cpu);
3829 		entry->hi.fields.destination =
3830 					APICID_TO_IRTE_DEST_HI(cpu);
3831 	}
3832 	entry->lo.fields_vapic.is_run = is_run;
3833 
3834 	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3835 				ir_data->irq_2_irte.index, entry);
3836 }
3837 EXPORT_SYMBOL(amd_iommu_update_ga);
3838 #endif
3839