1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. 4 * Author: Joerg Roedel <jroedel@suse.de> 5 * Leo Duran <leo.duran@amd.com> 6 */ 7 8 #define pr_fmt(fmt) "AMD-Vi: " fmt 9 #define dev_fmt(fmt) pr_fmt(fmt) 10 11 #include <linux/ratelimit.h> 12 #include <linux/pci.h> 13 #include <linux/acpi.h> 14 #include <linux/pci-ats.h> 15 #include <linux/bitmap.h> 16 #include <linux/slab.h> 17 #include <linux/string_choices.h> 18 #include <linux/debugfs.h> 19 #include <linux/scatterlist.h> 20 #include <linux/dma-map-ops.h> 21 #include <linux/dma-direct.h> 22 #include <linux/idr.h> 23 #include <linux/iommu-helper.h> 24 #include <linux/delay.h> 25 #include <linux/amd-iommu.h> 26 #include <linux/notifier.h> 27 #include <linux/export.h> 28 #include <linux/irq.h> 29 #include <linux/irqchip/irq-msi-lib.h> 30 #include <linux/msi.h> 31 #include <linux/irqdomain.h> 32 #include <linux/percpu.h> 33 #include <linux/io-pgtable.h> 34 #include <linux/cc_platform.h> 35 #include <asm/irq_remapping.h> 36 #include <asm/io_apic.h> 37 #include <asm/apic.h> 38 #include <asm/hw_irq.h> 39 #include <asm/proto.h> 40 #include <asm/iommu.h> 41 #include <asm/gart.h> 42 #include <asm/dma.h> 43 #include <uapi/linux/iommufd.h> 44 45 #include "amd_iommu.h" 46 #include "../dma-iommu.h" 47 #include "../irq_remapping.h" 48 #include "../iommu-pages.h" 49 50 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 51 52 /* Reserved IOVA ranges */ 53 #define MSI_RANGE_START (0xfee00000) 54 #define MSI_RANGE_END (0xfeefffff) 55 #define HT_RANGE_START (0xfd00000000ULL) 56 #define HT_RANGE_END (0xffffffffffULL) 57 58 LIST_HEAD(ioapic_map); 59 LIST_HEAD(hpet_map); 60 LIST_HEAD(acpihid_map); 61 62 const struct iommu_ops amd_iommu_ops; 63 static const struct iommu_dirty_ops amd_dirty_ops; 64 65 int amd_iommu_max_glx_val = -1; 66 67 /* 68 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 69 * to know which ones are already in use. 70 */ 71 DEFINE_IDA(pdom_ids); 72 73 static int amd_iommu_attach_device(struct iommu_domain *dom, 74 struct device *dev); 75 76 static void set_dte_entry(struct amd_iommu *iommu, 77 struct iommu_dev_data *dev_data); 78 79 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 80 81 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 82 83 /**************************************************************************** 84 * 85 * Helper functions 86 * 87 ****************************************************************************/ 88 89 static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) 90 { 91 /* 92 * Note: 93 * We use arch_cmpxchg128_local() because: 94 * - Need cmpxchg16b instruction mainly for 128-bit store to DTE 95 * (not necessary for cmpxchg since this function is already 96 * protected by a spin_lock for this DTE). 97 * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. 98 */ 99 arch_cmpxchg128_local(ptr, *ptr, val); 100 } 101 102 static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new) 103 { 104 struct dev_table_entry old; 105 106 old.data128[1] = ptr->data128[1]; 107 /* 108 * Preserve DTE_DATA2_INTR_MASK. This needs to be 109 * done here since it requires to be inside 110 * spin_lock(&dev_data->dte_lock) context. 
	 */
	new->data[2] &= ~DTE_DATA2_INTR_MASK;
	new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK;

	amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]);
}

static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new)
{
	amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]);
}

/*
 * Note:
 * IOMMU reads the entire Device Table entry in a single 256-bit transaction
 * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver
 * needs to ensure the following:
 * - DTE[V|GV] bit is being written last when setting.
 * - DTE[V|GV] bit is being written first when clearing.
 *
 * This function is used only by code which updates the DMA translation part
 * of the DTE. So, only consider control bits related to DMA when updating
 * the entry.
 */
static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
			  struct dev_table_entry *new)
{
	unsigned long flags;
	struct dev_table_entry *dev_table = get_dev_table(iommu);
	struct dev_table_entry *ptr = &dev_table[dev_data->devid];

	spin_lock_irqsave(&dev_data->dte_lock, flags);

	if (!(ptr->data[0] & DTE_FLAG_V)) {
		/* Existing DTE is not valid. */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!(new->data[0] & DTE_FLAG_V)) {
		/* Existing DTE is valid. New DTE is not valid. */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has no guest page table.
		 */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has a guest page table,
		 * new DTE has no guest page table.
		 */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) !=
		   FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) {
		/*
		 * Both DTEs are valid and have guest page tables,
		 * but with a different number of levels. So, we need
		 * to update both upper and lower 128-bit values, which
		 * requires disabling and flushing.
		 */
		struct dev_table_entry clear = {};

		/* First disable DTE */
		write_dte_lower128(ptr, &clear);
		iommu_flush_dte_sync(iommu, dev_data->devid);

		/* Then update DTE */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else {
		/*
		 * Both DTEs are valid and have guest page tables with the
		 * same number of levels. Only the lower 128-bit half needs
		 * to be updated, so there is no need to disable the DTE.
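		 * (For instance, this is the path taken when only the host
		 * page-table root or the domain ID changes while the guest
		 * GCR3 configuration stays the same.)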
193 */ 194 write_dte_lower128(ptr, new); 195 } 196 197 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 198 } 199 200 static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 201 struct dev_table_entry *dte) 202 { 203 unsigned long flags; 204 struct dev_table_entry *ptr; 205 struct dev_table_entry *dev_table = get_dev_table(iommu); 206 207 ptr = &dev_table[dev_data->devid]; 208 209 spin_lock_irqsave(&dev_data->dte_lock, flags); 210 dte->data128[0] = ptr->data128[0]; 211 dte->data128[1] = ptr->data128[1]; 212 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 213 } 214 215 static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) 216 { 217 return (pdom && (pdom->pd_mode == PD_MODE_V2)); 218 } 219 220 static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom) 221 { 222 return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY); 223 } 224 225 /* 226 * We cannot support PASID w/ existing v1 page table in the same domain 227 * since it will be nested. However, existing domain w/ v2 page table 228 * or passthrough mode can be used for PASID. 229 */ 230 static inline bool pdom_is_sva_capable(struct protection_domain *pdom) 231 { 232 return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom); 233 } 234 235 static inline int get_acpihid_device_id(struct device *dev, 236 struct acpihid_map_entry **entry) 237 { 238 struct acpi_device *adev = ACPI_COMPANION(dev); 239 struct acpihid_map_entry *p, *p1 = NULL; 240 int hid_count = 0; 241 bool fw_bug; 242 243 if (!adev) 244 return -ENODEV; 245 246 list_for_each_entry(p, &acpihid_map, list) { 247 if (acpi_dev_hid_uid_match(adev, p->hid, 248 p->uid[0] ? p->uid : NULL)) { 249 p1 = p; 250 fw_bug = false; 251 hid_count = 1; 252 break; 253 } 254 255 /* 256 * Count HID matches w/o UID, raise FW_BUG but allow exactly one match 257 */ 258 if (acpi_dev_hid_match(adev, p->hid)) { 259 p1 = p; 260 hid_count++; 261 fw_bug = true; 262 } 263 } 264 265 if (!p1) 266 return -EINVAL; 267 if (fw_bug) 268 dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", 269 hid_count, str_plural(hid_count)); 270 if (hid_count > 1) 271 return -EINVAL; 272 if (entry) 273 *entry = p1; 274 275 return p1->devid; 276 } 277 278 static inline int get_device_sbdf_id(struct device *dev) 279 { 280 int sbdf; 281 282 if (dev_is_pci(dev)) 283 sbdf = get_pci_sbdf_id(to_pci_dev(dev)); 284 else 285 sbdf = get_acpihid_device_id(dev, NULL); 286 287 return sbdf; 288 } 289 290 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu) 291 { 292 struct dev_table_entry *dev_table; 293 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 294 295 BUG_ON(pci_seg == NULL); 296 dev_table = pci_seg->dev_table; 297 BUG_ON(dev_table == NULL); 298 299 return dev_table; 300 } 301 302 static inline u16 get_device_segment(struct device *dev) 303 { 304 u16 seg; 305 306 if (dev_is_pci(dev)) { 307 struct pci_dev *pdev = to_pci_dev(dev); 308 309 seg = pci_domain_nr(pdev->bus); 310 } else { 311 u32 devid = get_acpihid_device_id(dev, NULL); 312 313 seg = PCI_SBDF_TO_SEGID(devid); 314 } 315 316 return seg; 317 } 318 319 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */ 320 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid) 321 { 322 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 323 324 pci_seg->rlookup_table[devid] = iommu; 325 } 326 327 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) 328 { 329 struct amd_iommu_pci_seg *pci_seg; 330 331 for_each_pci_segment(pci_seg) { 332 if 
(pci_seg->id == seg) 333 return pci_seg->rlookup_table[devid]; 334 } 335 return NULL; 336 } 337 338 static struct amd_iommu *rlookup_amd_iommu(struct device *dev) 339 { 340 u16 seg = get_device_segment(dev); 341 int devid = get_device_sbdf_id(dev); 342 343 if (devid < 0) 344 return NULL; 345 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid)); 346 } 347 348 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) 349 { 350 struct iommu_dev_data *dev_data; 351 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 352 353 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); 354 if (!dev_data) 355 return NULL; 356 357 mutex_init(&dev_data->mutex); 358 spin_lock_init(&dev_data->dte_lock); 359 dev_data->devid = devid; 360 ratelimit_default_init(&dev_data->rs); 361 362 llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list); 363 return dev_data; 364 } 365 366 struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) 367 { 368 struct iommu_dev_data *dev_data; 369 struct llist_node *node; 370 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 371 372 if (llist_empty(&pci_seg->dev_data_list)) 373 return NULL; 374 375 node = pci_seg->dev_data_list.first; 376 llist_for_each_entry(dev_data, node, dev_data_list) { 377 if (dev_data->devid == devid) 378 return dev_data; 379 } 380 381 return NULL; 382 } 383 384 static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) 385 { 386 struct dev_table_entry new; 387 struct amd_iommu *iommu; 388 struct iommu_dev_data *dev_data, *alias_data; 389 u16 devid = pci_dev_id(pdev); 390 int ret = 0; 391 392 if (devid == alias) 393 return 0; 394 395 iommu = rlookup_amd_iommu(&pdev->dev); 396 if (!iommu) 397 return 0; 398 399 /* Copy the data from pdev */ 400 dev_data = dev_iommu_priv_get(&pdev->dev); 401 if (!dev_data) { 402 pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid); 403 ret = -EINVAL; 404 goto out; 405 } 406 get_dte256(iommu, dev_data, &new); 407 408 /* Setup alias */ 409 alias_data = find_dev_data(iommu, alias); 410 if (!alias_data) { 411 pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias); 412 ret = -EINVAL; 413 goto out; 414 } 415 update_dte256(iommu, alias_data, &new); 416 417 amd_iommu_set_rlookup_table(iommu, alias); 418 out: 419 return ret; 420 } 421 422 static void clone_aliases(struct amd_iommu *iommu, struct device *dev) 423 { 424 struct pci_dev *pdev; 425 426 if (!dev_is_pci(dev)) 427 return; 428 pdev = to_pci_dev(dev); 429 430 /* 431 * The IVRS alias stored in the alias table may not be 432 * part of the PCI DMA aliases if it's bus differs 433 * from the original device. 434 */ 435 clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL); 436 437 pci_for_each_dma_alias(pdev, clone_alias, NULL); 438 } 439 440 static void setup_aliases(struct amd_iommu *iommu, struct device *dev) 441 { 442 struct pci_dev *pdev = to_pci_dev(dev); 443 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 444 u16 ivrs_alias; 445 446 /* For ACPI HID devices, there are no aliases */ 447 if (!dev_is_pci(dev)) 448 return; 449 450 /* 451 * Add the IVRS alias to the pci aliases if it is on the same 452 * bus. The IVRS table may know about a quirk that we don't. 
453 */ 454 ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)]; 455 if (ivrs_alias != pci_dev_id(pdev) && 456 PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) 457 pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1); 458 459 clone_aliases(iommu, dev); 460 } 461 462 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid) 463 { 464 struct iommu_dev_data *dev_data; 465 466 dev_data = search_dev_data(iommu, devid); 467 468 if (dev_data == NULL) { 469 dev_data = alloc_dev_data(iommu, devid); 470 if (!dev_data) 471 return NULL; 472 473 if (translation_pre_enabled(iommu)) 474 dev_data->defer_attach = true; 475 } 476 477 return dev_data; 478 } 479 480 /* 481 * Find or create an IOMMU group for a acpihid device. 482 */ 483 static struct iommu_group *acpihid_device_group(struct device *dev) 484 { 485 struct acpihid_map_entry *p, *entry = NULL; 486 int devid; 487 488 devid = get_acpihid_device_id(dev, &entry); 489 if (devid < 0) 490 return ERR_PTR(devid); 491 492 list_for_each_entry(p, &acpihid_map, list) { 493 if ((devid == p->devid) && p->group) 494 entry->group = p->group; 495 } 496 497 if (!entry->group) 498 entry->group = generic_device_group(dev); 499 else 500 iommu_group_ref_get(entry->group); 501 502 return entry->group; 503 } 504 505 static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) 506 { 507 return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); 508 } 509 510 static u32 pdev_get_caps(struct pci_dev *pdev) 511 { 512 int features; 513 u32 flags = 0; 514 515 if (pci_ats_supported(pdev)) 516 flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 517 518 if (pci_pri_supported(pdev)) 519 flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 520 521 features = pci_pasid_features(pdev); 522 if (features >= 0) { 523 flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 524 525 if (features & PCI_PASID_CAP_EXEC) 526 flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 527 528 if (features & PCI_PASID_CAP_PRIV) 529 flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 530 } 531 532 return flags; 533 } 534 535 static inline int pdev_enable_cap_ats(struct pci_dev *pdev) 536 { 537 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 538 int ret = -EINVAL; 539 540 if (dev_data->ats_enabled) 541 return 0; 542 543 if (amd_iommu_iotlb_sup && 544 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { 545 ret = pci_enable_ats(pdev, PAGE_SHIFT); 546 if (!ret) { 547 dev_data->ats_enabled = 1; 548 dev_data->ats_qdep = pci_ats_queue_depth(pdev); 549 } 550 } 551 552 return ret; 553 } 554 555 static inline void pdev_disable_cap_ats(struct pci_dev *pdev) 556 { 557 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 558 559 if (dev_data->ats_enabled) { 560 pci_disable_ats(pdev); 561 dev_data->ats_enabled = 0; 562 } 563 } 564 565 static inline int pdev_enable_cap_pri(struct pci_dev *pdev) 566 { 567 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 568 int ret = -EINVAL; 569 570 if (dev_data->pri_enabled) 571 return 0; 572 573 if (!dev_data->ats_enabled) 574 return 0; 575 576 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { 577 /* 578 * First reset the PRI state of the device. 
		 * FIXME: Hardcode number of outstanding requests for now
		 */
		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
			dev_data->pri_enabled = 1;
			dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev);

			ret = 0;
		}
	}

	return ret;
}

static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);

	if (dev_data->pri_enabled) {
		pci_disable_pri(pdev);
		dev_data->pri_enabled = 0;
	}
}

static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
	int ret = -EINVAL;

	if (dev_data->pasid_enabled)
		return 0;

	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
		/* Only allow access to user-accessible pages */
		ret = pci_enable_pasid(pdev, 0);
		if (!ret)
			dev_data->pasid_enabled = 1;
	}

	return ret;
}

static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);

	if (dev_data->pasid_enabled) {
		pci_disable_pasid(pdev);
		dev_data->pasid_enabled = 0;
	}
}

static void pdev_enable_caps(struct pci_dev *pdev)
{
	pdev_enable_cap_pasid(pdev);
	pdev_enable_cap_ats(pdev);
	pdev_enable_cap_pri(pdev);
}

static void pdev_disable_caps(struct pci_dev *pdev)
{
	pdev_disable_cap_ats(pdev);
	pdev_disable_cap_pasid(pdev);
	pdev_disable_cap_pri(pdev);
}

/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	struct amd_iommu_pci_seg *pci_seg;
	struct amd_iommu *iommu;
	int devid, sbdf;

	if (!dev)
		return false;

	sbdf = get_device_sbdf_id(dev);
	if (sbdf < 0)
		return false;
	devid = PCI_SBDF_TO_DEVID(sbdf);

	iommu = rlookup_amd_iommu(dev);
	if (!iommu)
		return false;

	/* Out of our scope? */
	pci_seg = iommu->pci_seg;
	if (devid > pci_seg->last_bdf)
		return false;

	return true;
}

static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
{
	struct iommu_dev_data *dev_data;
	int devid, sbdf;

	if (dev_iommu_priv_get(dev))
		return 0;

	sbdf = get_device_sbdf_id(dev);
	if (sbdf < 0)
		return sbdf;

	devid = PCI_SBDF_TO_DEVID(sbdf);
	dev_data = find_dev_data(iommu, devid);
	if (!dev_data)
		return -ENOMEM;

	dev_data->dev = dev;

	/*
	 * The dev_iommu_priv_set() needs to be called before setup_aliases.
	 * Otherwise, subsequent call to dev_iommu_priv_get() will fail.
	 */
	dev_iommu_priv_set(dev, dev_data);
	setup_aliases(iommu, dev);

	/*
	 * By default we use passthrough mode for IOMMUv2 capable devices.
	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
	 * invalid address), we ignore the capability for the device so
	 * it'll be forced to go into translation mode.
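	 * (For example, booting with amd_iommu=force_isolation and without
	 * passthrough leaves dev_data->flags at 0, so the PASID/ATS/PRI
	 * capabilities are never enabled for the device later on.)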
705 */ 706 if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && 707 dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { 708 dev_data->flags = pdev_get_caps(to_pci_dev(dev)); 709 } 710 711 return 0; 712 } 713 714 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) 715 { 716 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 717 struct dev_table_entry *dev_table = get_dev_table(iommu); 718 int devid, sbdf; 719 720 sbdf = get_device_sbdf_id(dev); 721 if (sbdf < 0) 722 return; 723 724 devid = PCI_SBDF_TO_DEVID(sbdf); 725 pci_seg->rlookup_table[devid] = NULL; 726 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry)); 727 728 setup_aliases(iommu, dev); 729 } 730 731 732 /**************************************************************************** 733 * 734 * Interrupt handling functions 735 * 736 ****************************************************************************/ 737 738 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) 739 { 740 int i; 741 struct dev_table_entry dte; 742 struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); 743 744 get_dte256(iommu, dev_data, &dte); 745 746 for (i = 0; i < 4; ++i) 747 pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); 748 } 749 750 static void dump_command(unsigned long phys_addr) 751 { 752 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 753 int i; 754 755 for (i = 0; i < 4; ++i) 756 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]); 757 } 758 759 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event) 760 { 761 struct iommu_dev_data *dev_data = NULL; 762 int devid, vmg_tag, flags; 763 struct pci_dev *pdev; 764 u64 spa; 765 766 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 767 vmg_tag = (event[1]) & 0xFFFF; 768 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 769 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8); 770 771 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 772 devid & 0xff); 773 if (pdev) 774 dev_data = dev_iommu_priv_get(&pdev->dev); 775 776 if (dev_data) { 777 if (__ratelimit(&dev_data->rs)) { 778 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 779 vmg_tag, spa, flags); 780 } 781 } else { 782 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 783 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 784 vmg_tag, spa, flags); 785 } 786 787 if (pdev) 788 pci_dev_put(pdev); 789 } 790 791 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event) 792 { 793 struct iommu_dev_data *dev_data = NULL; 794 int devid, flags_rmp, vmg_tag, flags; 795 struct pci_dev *pdev; 796 u64 gpa; 797 798 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 799 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF; 800 vmg_tag = (event[1]) & 0xFFFF; 801 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 802 gpa = ((u64)event[3] << 32) | event[2]; 803 804 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 805 devid & 0xff); 806 if (pdev) 807 dev_data = dev_iommu_priv_get(&pdev->dev); 808 809 if (dev_data) { 810 if (__ratelimit(&dev_data->rs)) { 811 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 812 vmg_tag, gpa, flags_rmp, flags); 813 } 814 } else { 815 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, 
flags_rmp=0x%04x, flags=0x%04x]\n", 816 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 817 vmg_tag, gpa, flags_rmp, flags); 818 } 819 820 if (pdev) 821 pci_dev_put(pdev); 822 } 823 824 #define IS_IOMMU_MEM_TRANSACTION(flags) \ 825 (((flags) & EVENT_FLAG_I) == 0) 826 827 #define IS_WRITE_REQUEST(flags) \ 828 ((flags) & EVENT_FLAG_RW) 829 830 static void amd_iommu_report_page_fault(struct amd_iommu *iommu, 831 u16 devid, u16 domain_id, 832 u64 address, int flags) 833 { 834 struct iommu_dev_data *dev_data = NULL; 835 struct pci_dev *pdev; 836 837 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 838 devid & 0xff); 839 if (pdev) 840 dev_data = dev_iommu_priv_get(&pdev->dev); 841 842 if (dev_data) { 843 /* 844 * If this is a DMA fault (for which the I(nterrupt) 845 * bit will be unset), allow report_iommu_fault() to 846 * prevent logging it. 847 */ 848 if (IS_IOMMU_MEM_TRANSACTION(flags)) { 849 /* Device not attached to domain properly */ 850 if (dev_data->domain == NULL) { 851 pr_err_ratelimited("Event logged [Device not attached to domain properly]\n"); 852 pr_err_ratelimited(" device=%04x:%02x:%02x.%x domain=0x%04x\n", 853 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), 854 PCI_FUNC(devid), domain_id); 855 goto out; 856 } 857 858 if (!report_iommu_fault(&dev_data->domain->domain, 859 &pdev->dev, address, 860 IS_WRITE_REQUEST(flags) ? 861 IOMMU_FAULT_WRITE : 862 IOMMU_FAULT_READ)) 863 goto out; 864 } 865 866 if (__ratelimit(&dev_data->rs)) { 867 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", 868 domain_id, address, flags); 869 } 870 } else { 871 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", 872 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 873 domain_id, address, flags); 874 } 875 876 out: 877 if (pdev) 878 pci_dev_put(pdev); 879 } 880 881 static void iommu_print_event(struct amd_iommu *iommu, void *__evt) 882 { 883 struct device *dev = iommu->iommu.dev; 884 int type, devid, flags, tag; 885 volatile u32 *event = __evt; 886 int count = 0; 887 u64 address, ctrl; 888 u32 pasid; 889 890 retry: 891 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 892 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 893 pasid = (event[0] & EVENT_DOMID_MASK_HI) | 894 (event[1] & EVENT_DOMID_MASK_LO); 895 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 896 address = (u64)(((u64)event[3]) << 32) | event[2]; 897 ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); 898 899 if (type == 0) { 900 /* Did we hit the erratum? 
*/ 901 if (++count == LOOP_TIMEOUT) { 902 pr_err("No event written to event log\n"); 903 return; 904 } 905 udelay(1); 906 goto retry; 907 } 908 909 if (type == EVENT_TYPE_IO_FAULT) { 910 amd_iommu_report_page_fault(iommu, devid, pasid, address, flags); 911 return; 912 } 913 914 switch (type) { 915 case EVENT_TYPE_ILL_DEV: 916 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 917 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 918 pasid, address, flags); 919 dev_err(dev, "Control Reg : 0x%llx\n", ctrl); 920 dump_dte_entry(iommu, devid); 921 break; 922 case EVENT_TYPE_DEV_TAB_ERR: 923 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x " 924 "address=0x%llx flags=0x%04x]\n", 925 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 926 address, flags); 927 break; 928 case EVENT_TYPE_PAGE_TAB_ERR: 929 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n", 930 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 931 pasid, address, flags); 932 break; 933 case EVENT_TYPE_ILL_CMD: 934 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address); 935 dump_command(address); 936 break; 937 case EVENT_TYPE_CMD_HARD_ERR: 938 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n", 939 address, flags); 940 break; 941 case EVENT_TYPE_IOTLB_INV_TO: 942 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n", 943 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 944 address); 945 break; 946 case EVENT_TYPE_INV_DEV_REQ: 947 dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 948 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 949 pasid, address, flags); 950 break; 951 case EVENT_TYPE_RMP_FAULT: 952 amd_iommu_report_rmp_fault(iommu, event); 953 break; 954 case EVENT_TYPE_RMP_HW_ERR: 955 amd_iommu_report_rmp_hw_error(iommu, event); 956 break; 957 case EVENT_TYPE_INV_PPR_REQ: 958 pasid = PPR_PASID(*((u64 *)__evt)); 959 tag = event[1] & 0x03FF; 960 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", 961 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 962 pasid, address, flags, tag); 963 break; 964 default: 965 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n", 966 event[0], event[1], event[2], event[3]); 967 } 968 969 /* 970 * To detect the hardware errata 732 we need to clear the 971 * entry back to zero. This issue does not exist on SNP 972 * enabled system. Also this buffer is not writeable on 973 * SNP enabled system. 
974 */ 975 if (!amd_iommu_snp_en) 976 memset(__evt, 0, 4 * sizeof(u32)); 977 } 978 979 static void iommu_poll_events(struct amd_iommu *iommu) 980 { 981 u32 head, tail; 982 983 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 984 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 985 986 while (head != tail) { 987 iommu_print_event(iommu, iommu->evt_buf + head); 988 989 /* Update head pointer of hardware ring-buffer */ 990 head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; 991 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 992 } 993 994 } 995 996 #ifdef CONFIG_IRQ_REMAP 997 static int (*iommu_ga_log_notifier)(u32); 998 999 int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) 1000 { 1001 iommu_ga_log_notifier = notifier; 1002 1003 /* 1004 * Ensure all in-flight IRQ handlers run to completion before returning 1005 * to the caller, e.g. to ensure module code isn't unloaded while it's 1006 * being executed in the IRQ handler. 1007 */ 1008 if (!notifier) 1009 synchronize_rcu(); 1010 1011 return 0; 1012 } 1013 EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); 1014 1015 static void iommu_poll_ga_log(struct amd_iommu *iommu) 1016 { 1017 u32 head, tail; 1018 1019 if (iommu->ga_log == NULL) 1020 return; 1021 1022 head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1023 tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET); 1024 1025 while (head != tail) { 1026 volatile u64 *raw; 1027 u64 log_entry; 1028 1029 raw = (u64 *)(iommu->ga_log + head); 1030 1031 /* Avoid memcpy function-call overhead */ 1032 log_entry = *raw; 1033 1034 /* Update head pointer of hardware ring-buffer */ 1035 head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE; 1036 writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1037 1038 /* Handle GA entry */ 1039 switch (GA_REQ_TYPE(log_entry)) { 1040 case GA_GUEST_NR: 1041 if (!iommu_ga_log_notifier) 1042 break; 1043 1044 pr_debug("%s: devid=%#x, ga_tag=%#x\n", 1045 __func__, GA_DEVID(log_entry), 1046 GA_TAG(log_entry)); 1047 1048 if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0) 1049 pr_err("GA log notifier failed.\n"); 1050 break; 1051 default: 1052 break; 1053 } 1054 } 1055 } 1056 1057 static void 1058 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) 1059 { 1060 if (!irq_remapping_enabled || !dev_is_pci(dev) || 1061 !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev))) 1062 return; 1063 1064 dev_set_msi_domain(dev, iommu->ir_domain); 1065 } 1066 1067 #else /* CONFIG_IRQ_REMAP */ 1068 static inline void 1069 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } 1070 #endif /* !CONFIG_IRQ_REMAP */ 1071 1072 static void amd_iommu_handle_irq(void *data, const char *evt_type, 1073 u32 int_mask, u32 overflow_mask, 1074 void (*int_handler)(struct amd_iommu *), 1075 void (*overflow_handler)(struct amd_iommu *)) 1076 { 1077 struct amd_iommu *iommu = (struct amd_iommu *) data; 1078 u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 1079 u32 mask = int_mask | overflow_mask; 1080 1081 while (status & mask) { 1082 /* Enable interrupt sources again */ 1083 writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET); 1084 1085 if (int_handler) { 1086 pr_devel("Processing IOMMU (ivhd%d) %s Log\n", 1087 iommu->index, evt_type); 1088 int_handler(iommu); 1089 } 1090 1091 if ((status & overflow_mask) && overflow_handler) 1092 overflow_handler(iommu); 1093 1094 /* 1095 * Hardware bug: ERBT1312 1096 * When re-enabling interrupt (by writing 1 1097 * to clear the bit), the hardware might also try to set 1098 * the interrupt bit in 
the event status register.
		 * In this scenario, the bit will be set and will disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, the driver will need to go through the interrupt
		 * handler again and re-clear the bits.
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}
}

irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
			     MMIO_STATUS_EVT_OVERFLOW_MASK,
			     iommu_poll_events, amd_iommu_restart_event_logging);

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
			     MMIO_STATUS_PPR_OVERFLOW_MASK,
			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
{
#ifdef CONFIG_IRQ_REMAP
	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
#endif

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	amd_iommu_int_thread_evtlog(irq, data);
	amd_iommu_int_thread_pprlog(irq, data);
	amd_iommu_int_thread_galog(irq, data);

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
	int i = 0;

	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {
		pr_alert("Completion-Wait loop timed out\n");
		return -EIO;
	}

	return 0;
}

static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd)
{
	u8 *target;
	u32 tail;

	/* Copy command to buffer */
	tail = iommu->cmd_buf_tail;
	target = iommu->cmd_buf + tail;
	memcpy(target, cmd, sizeof(*cmd));

	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
	iommu->cmd_buf_tail = tail;

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}

static void build_completion_wait(struct iommu_cmd *cmd,
				  struct amd_iommu *iommu,
				  u64 data)
{
	u64 paddr = iommu->cmd_sem_paddr;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(paddr);
	cmd->data[2] = lower_32_bits(data);
	cmd->data[3] = upper_32_bits(data);
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

/*
 * Builds an invalidation address which is suitable for one page or multiple
 * pages. Sets the size bit (S) as needed if more than one page is flushed.
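 *
 * For illustration: flushing address=0x100000, size=0x3000 gives
 * end=0x102fff and msb_diff=13, so the low bits are filled in and the
 * result is masked to 0x101000 with the size bit set; the IOMMU then
 * invalidates the naturally aligned 16KB region 0x100000-0x103fff that
 * covers the requested 12KB range.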
1219 */ 1220 static inline u64 build_inv_address(u64 address, size_t size) 1221 { 1222 u64 pages, end, msb_diff; 1223 1224 pages = iommu_num_pages(address, size, PAGE_SIZE); 1225 1226 if (pages == 1) 1227 return address & PAGE_MASK; 1228 1229 end = address + size - 1; 1230 1231 /* 1232 * msb_diff would hold the index of the most significant bit that 1233 * flipped between the start and end. 1234 */ 1235 msb_diff = fls64(end ^ address) - 1; 1236 1237 /* 1238 * Bits 63:52 are sign extended. If for some reason bit 51 is different 1239 * between the start and the end, invalidate everything. 1240 */ 1241 if (unlikely(msb_diff > 51)) { 1242 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 1243 } else { 1244 /* 1245 * The msb-bit must be clear on the address. Just set all the 1246 * lower bits. 1247 */ 1248 address |= (1ull << msb_diff) - 1; 1249 } 1250 1251 /* Clear bits 11:0 */ 1252 address &= PAGE_MASK; 1253 1254 /* Set the size bit - we flush more than one 4kb page */ 1255 return address | CMD_INV_IOMMU_PAGES_SIZE_MASK; 1256 } 1257 1258 static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 1259 size_t size, u16 domid, 1260 ioasid_t pasid, bool gn) 1261 { 1262 u64 inv_address = build_inv_address(address, size); 1263 1264 memset(cmd, 0, sizeof(*cmd)); 1265 1266 cmd->data[1] |= domid; 1267 cmd->data[2] = lower_32_bits(inv_address); 1268 cmd->data[3] = upper_32_bits(inv_address); 1269 /* PDE bit - we want to flush everything, not only the PTEs */ 1270 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 1271 if (gn) { 1272 cmd->data[0] |= pasid; 1273 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 1274 } 1275 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); 1276 } 1277 1278 static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, 1279 u64 address, size_t size, 1280 ioasid_t pasid, bool gn) 1281 { 1282 u64 inv_address = build_inv_address(address, size); 1283 1284 memset(cmd, 0, sizeof(*cmd)); 1285 1286 cmd->data[0] = devid; 1287 cmd->data[0] |= (qdep & 0xff) << 24; 1288 cmd->data[1] = devid; 1289 cmd->data[2] = lower_32_bits(inv_address); 1290 cmd->data[3] = upper_32_bits(inv_address); 1291 if (gn) { 1292 cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; 1293 cmd->data[1] |= (pasid & 0xff) << 16; 1294 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 1295 } 1296 1297 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); 1298 } 1299 1300 static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, 1301 int status, int tag, u8 gn) 1302 { 1303 memset(cmd, 0, sizeof(*cmd)); 1304 1305 cmd->data[0] = devid; 1306 if (gn) { 1307 cmd->data[1] = pasid; 1308 cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; 1309 } 1310 cmd->data[3] = tag & 0x1ff; 1311 cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; 1312 1313 CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); 1314 } 1315 1316 static void build_inv_all(struct iommu_cmd *cmd) 1317 { 1318 memset(cmd, 0, sizeof(*cmd)); 1319 CMD_SET_TYPE(cmd, CMD_INV_ALL); 1320 } 1321 1322 static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) 1323 { 1324 memset(cmd, 0, sizeof(*cmd)); 1325 cmd->data[0] = devid; 1326 CMD_SET_TYPE(cmd, CMD_INV_IRT); 1327 } 1328 1329 /* 1330 * Writes the command to the IOMMUs command buffer and informs the 1331 * hardware about the new command. 
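 *
 * If 0x20 bytes or fewer are left between the software tail and the last
 * seen hardware head, the head pointer is re-read (with udelay() between
 * retries) until space frees up or LOOP_TIMEOUT is reached; the margin
 * keeps the tail from ever catching up with the head.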
1332 */ 1333 static int __iommu_queue_command_sync(struct amd_iommu *iommu, 1334 struct iommu_cmd *cmd, 1335 bool sync) 1336 { 1337 unsigned int count = 0; 1338 u32 left, next_tail; 1339 1340 next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; 1341 again: 1342 left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE; 1343 1344 if (left <= 0x20) { 1345 /* Skip udelay() the first time around */ 1346 if (count++) { 1347 if (count == LOOP_TIMEOUT) { 1348 pr_err("Command buffer timeout\n"); 1349 return -EIO; 1350 } 1351 1352 udelay(1); 1353 } 1354 1355 /* Update head and recheck remaining space */ 1356 iommu->cmd_buf_head = readl(iommu->mmio_base + 1357 MMIO_CMD_HEAD_OFFSET); 1358 1359 goto again; 1360 } 1361 1362 copy_cmd_to_buffer(iommu, cmd); 1363 1364 /* Do we need to make sure all commands are processed? */ 1365 iommu->need_sync = sync; 1366 1367 return 0; 1368 } 1369 1370 static int iommu_queue_command_sync(struct amd_iommu *iommu, 1371 struct iommu_cmd *cmd, 1372 bool sync) 1373 { 1374 unsigned long flags; 1375 int ret; 1376 1377 raw_spin_lock_irqsave(&iommu->lock, flags); 1378 ret = __iommu_queue_command_sync(iommu, cmd, sync); 1379 raw_spin_unlock_irqrestore(&iommu->lock, flags); 1380 1381 return ret; 1382 } 1383 1384 static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 1385 { 1386 return iommu_queue_command_sync(iommu, cmd, true); 1387 } 1388 1389 /* 1390 * This function queues a completion wait command into the command 1391 * buffer of an IOMMU 1392 */ 1393 static int iommu_completion_wait(struct amd_iommu *iommu) 1394 { 1395 struct iommu_cmd cmd; 1396 unsigned long flags; 1397 int ret; 1398 u64 data; 1399 1400 if (!iommu->need_sync) 1401 return 0; 1402 1403 data = atomic64_inc_return(&iommu->cmd_sem_val); 1404 build_completion_wait(&cmd, iommu, data); 1405 1406 raw_spin_lock_irqsave(&iommu->lock, flags); 1407 1408 ret = __iommu_queue_command_sync(iommu, &cmd, false); 1409 if (ret) 1410 goto out_unlock; 1411 1412 ret = wait_on_sem(iommu, data); 1413 1414 out_unlock: 1415 raw_spin_unlock_irqrestore(&iommu->lock, flags); 1416 1417 return ret; 1418 } 1419 1420 static void domain_flush_complete(struct protection_domain *domain) 1421 { 1422 struct pdom_iommu_info *pdom_iommu_info; 1423 unsigned long i; 1424 1425 lockdep_assert_held(&domain->lock); 1426 1427 /* 1428 * Devices of this domain are behind this IOMMU 1429 * We need to wait for completion of all commands. 1430 */ 1431 xa_for_each(&domain->iommu_array, i, pdom_iommu_info) 1432 iommu_completion_wait(pdom_iommu_info->iommu); 1433 } 1434 1435 static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) 1436 { 1437 struct iommu_cmd cmd; 1438 1439 build_inv_dte(&cmd, devid); 1440 1441 return iommu_queue_command(iommu, &cmd); 1442 } 1443 1444 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid) 1445 { 1446 int ret; 1447 1448 ret = iommu_flush_dte(iommu, devid); 1449 if (!ret) 1450 iommu_completion_wait(iommu); 1451 } 1452 1453 static void amd_iommu_flush_dte_all(struct amd_iommu *iommu) 1454 { 1455 u32 devid; 1456 u16 last_bdf = iommu->pci_seg->last_bdf; 1457 1458 for (devid = 0; devid <= last_bdf; ++devid) 1459 iommu_flush_dte(iommu, devid); 1460 1461 iommu_completion_wait(iommu); 1462 } 1463 1464 /* 1465 * This function uses heavy locking and may disable irqs for some time. But 1466 * this is no issue because it is only called during resume. 
1467 */ 1468 static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) 1469 { 1470 u32 dom_id; 1471 u16 last_bdf = iommu->pci_seg->last_bdf; 1472 1473 for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { 1474 struct iommu_cmd cmd; 1475 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1476 dom_id, IOMMU_NO_PASID, false); 1477 iommu_queue_command(iommu, &cmd); 1478 } 1479 1480 iommu_completion_wait(iommu); 1481 } 1482 1483 static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) 1484 { 1485 struct iommu_cmd cmd; 1486 1487 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1488 dom_id, IOMMU_NO_PASID, false); 1489 iommu_queue_command(iommu, &cmd); 1490 1491 iommu_completion_wait(iommu); 1492 } 1493 1494 static void amd_iommu_flush_all(struct amd_iommu *iommu) 1495 { 1496 struct iommu_cmd cmd; 1497 1498 build_inv_all(&cmd); 1499 1500 iommu_queue_command(iommu, &cmd); 1501 iommu_completion_wait(iommu); 1502 } 1503 1504 static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid) 1505 { 1506 struct iommu_cmd cmd; 1507 1508 build_inv_irt(&cmd, devid); 1509 1510 iommu_queue_command(iommu, &cmd); 1511 } 1512 1513 static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) 1514 { 1515 u32 devid; 1516 u16 last_bdf = iommu->pci_seg->last_bdf; 1517 1518 if (iommu->irtcachedis_enabled) 1519 return; 1520 1521 for (devid = 0; devid <= last_bdf; devid++) 1522 iommu_flush_irt(iommu, devid); 1523 1524 iommu_completion_wait(iommu); 1525 } 1526 1527 void amd_iommu_flush_all_caches(struct amd_iommu *iommu) 1528 { 1529 if (check_feature(FEATURE_IA)) { 1530 amd_iommu_flush_all(iommu); 1531 } else { 1532 amd_iommu_flush_dte_all(iommu); 1533 amd_iommu_flush_irt_all(iommu); 1534 amd_iommu_flush_tlb_all(iommu); 1535 } 1536 } 1537 1538 /* 1539 * Command send function for flushing on-device TLB 1540 */ 1541 static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address, 1542 size_t size, ioasid_t pasid, bool gn) 1543 { 1544 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 1545 struct iommu_cmd cmd; 1546 int qdep = dev_data->ats_qdep; 1547 1548 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, 1549 size, pasid, gn); 1550 1551 return iommu_queue_command(iommu, &cmd); 1552 } 1553 1554 static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data) 1555 { 1556 struct amd_iommu *iommu = data; 1557 1558 return iommu_flush_dte(iommu, alias); 1559 } 1560 1561 /* 1562 * Command send function for invalidating a device table entry 1563 */ 1564 static int device_flush_dte(struct iommu_dev_data *dev_data) 1565 { 1566 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 1567 struct pci_dev *pdev = NULL; 1568 struct amd_iommu_pci_seg *pci_seg; 1569 u16 alias; 1570 int ret; 1571 1572 if (dev_is_pci(dev_data->dev)) 1573 pdev = to_pci_dev(dev_data->dev); 1574 1575 if (pdev) 1576 ret = pci_for_each_dma_alias(pdev, 1577 device_flush_dte_alias, iommu); 1578 else 1579 ret = iommu_flush_dte(iommu, dev_data->devid); 1580 if (ret) 1581 return ret; 1582 1583 pci_seg = iommu->pci_seg; 1584 alias = pci_seg->alias_table[dev_data->devid]; 1585 if (alias != dev_data->devid) { 1586 ret = iommu_flush_dte(iommu, alias); 1587 if (ret) 1588 return ret; 1589 } 1590 1591 if (dev_data->ats_enabled) { 1592 /* Invalidate the entire contents of an IOTLB */ 1593 ret = device_flush_iotlb(dev_data, 0, ~0UL, 1594 IOMMU_NO_PASID, false); 1595 } 1596 1597 return ret; 1598 } 1599 1600 static int domain_flush_pages_v2(struct protection_domain *pdom, 1601 
u64 address, size_t size) 1602 { 1603 struct iommu_dev_data *dev_data; 1604 struct iommu_cmd cmd; 1605 int ret = 0; 1606 1607 lockdep_assert_held(&pdom->lock); 1608 list_for_each_entry(dev_data, &pdom->dev_list, list) { 1609 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 1610 u16 domid = dev_data->gcr3_info.domid; 1611 1612 build_inv_iommu_pages(&cmd, address, size, 1613 domid, IOMMU_NO_PASID, true); 1614 1615 ret |= iommu_queue_command(iommu, &cmd); 1616 } 1617 1618 return ret; 1619 } 1620 1621 static int domain_flush_pages_v1(struct protection_domain *pdom, 1622 u64 address, size_t size) 1623 { 1624 struct pdom_iommu_info *pdom_iommu_info; 1625 struct iommu_cmd cmd; 1626 int ret = 0; 1627 unsigned long i; 1628 1629 lockdep_assert_held(&pdom->lock); 1630 1631 build_inv_iommu_pages(&cmd, address, size, 1632 pdom->id, IOMMU_NO_PASID, false); 1633 1634 xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { 1635 /* 1636 * Devices of this domain are behind this IOMMU 1637 * We need a TLB flush 1638 */ 1639 ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); 1640 } 1641 1642 return ret; 1643 } 1644 1645 /* 1646 * TLB invalidation function which is called from the mapping functions. 1647 * It flushes range of PTEs of the domain. 1648 */ 1649 static void __domain_flush_pages(struct protection_domain *domain, 1650 u64 address, size_t size) 1651 { 1652 struct iommu_dev_data *dev_data; 1653 int ret = 0; 1654 ioasid_t pasid = IOMMU_NO_PASID; 1655 bool gn = false; 1656 1657 lockdep_assert_held(&domain->lock); 1658 1659 if (pdom_is_v2_pgtbl_mode(domain)) { 1660 gn = true; 1661 ret = domain_flush_pages_v2(domain, address, size); 1662 } else { 1663 ret = domain_flush_pages_v1(domain, address, size); 1664 } 1665 1666 list_for_each_entry(dev_data, &domain->dev_list, list) { 1667 1668 if (!dev_data->ats_enabled) 1669 continue; 1670 1671 ret |= device_flush_iotlb(dev_data, address, size, pasid, gn); 1672 } 1673 1674 WARN_ON(ret); 1675 } 1676 1677 void amd_iommu_domain_flush_pages(struct protection_domain *domain, 1678 u64 address, size_t size) 1679 { 1680 lockdep_assert_held(&domain->lock); 1681 1682 if (likely(!amd_iommu_np_cache)) { 1683 __domain_flush_pages(domain, address, size); 1684 1685 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ 1686 domain_flush_complete(domain); 1687 1688 return; 1689 } 1690 1691 /* 1692 * When NpCache is on, we infer that we run in a VM and use a vIOMMU. 1693 * In such setups it is best to avoid flushes of ranges which are not 1694 * naturally aligned, since it would lead to flushes of unmodified 1695 * PTEs. Such flushes would require the hypervisor to do more work than 1696 * necessary. Therefore, perform repeated flushes of aligned ranges 1697 * until you cover the range. Each iteration flushes the smaller 1698 * between the natural alignment of the address that we flush and the 1699 * greatest naturally aligned region that fits in the range. 1700 */ 1701 while (size != 0) { 1702 int addr_alignment = __ffs(address); 1703 int size_alignment = __fls(size); 1704 int min_alignment; 1705 size_t flush_size; 1706 1707 /* 1708 * size is always non-zero, but address might be zero, causing 1709 * addr_alignment to be negative. As the casting of the 1710 * argument in __ffs(address) to long might trim the high bits 1711 * of the address on x86-32, cast to long when doing the check. 
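		 *
		 * For illustration: address=0x1000, size=0x3000 results in
		 * two iterations, a 4KB flush at 0x1000 (min_alignment=12)
		 * followed by an 8KB flush at 0x2000 (min_alignment=13).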
		 */
		if (likely((unsigned long)address != 0))
			min_alignment = min(addr_alignment, size_alignment);
		else
			min_alignment = size_alignment;

		flush_size = 1ul << min_alignment;

		__domain_flush_pages(domain, address, flush_size);
		address += flush_size;
		size -= flush_size;
	}

	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
	domain_flush_complete(domain);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void amd_iommu_domain_flush_all(struct protection_domain *domain)
{
	amd_iommu_domain_flush_pages(domain, 0,
				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}

void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
				     ioasid_t pasid, u64 address, size_t size)
{
	struct iommu_cmd cmd;
	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);

	build_inv_iommu_pages(&cmd, address, size,
			      dev_data->gcr3_info.domid, pasid, true);
	iommu_queue_command(iommu, &cmd);

	if (dev_data->ats_enabled)
		device_flush_iotlb(dev_data, address, size, pasid, true);

	iommu_completion_wait(iommu);
}

static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
				ioasid_t pasid)
{
	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}

/* Flush the not present cache if it exists */
static void domain_flush_np_cache(struct protection_domain *domain,
				  dma_addr_t iova, size_t size)
{
	if (unlikely(amd_iommu_np_cache)) {
		unsigned long flags;

		spin_lock_irqsave(&domain->lock, flags);
		amd_iommu_domain_flush_pages(domain, iova, size);
		spin_unlock_irqrestore(&domain->lock, flags);
	}
}


/*
 * This function flushes the DTEs for all devices in domain
 */
void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;

	lockdep_assert_held(&domain->lock);

	list_for_each_entry(dev_data, &domain->dev_list, list) {
		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);

		set_dte_entry(iommu, dev_data);
		clone_aliases(iommu, dev_data->dev);
	}

	list_for_each_entry(dev_data, &domain->dev_list, list)
		device_flush_dte(dev_data);

	domain_flush_complete(domain);
}

int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;

	dev_data = dev_iommu_priv_get(dev);
	iommu = get_amd_iommu_from_dev(dev);

	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
			   tag, dev_data->pri_tlp);

	return iommu_queue_command(iommu, &cmd);
}

/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
1817 * 1818 ****************************************************************************/ 1819 1820 static int pdom_id_alloc(void) 1821 { 1822 return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); 1823 } 1824 1825 static void pdom_id_free(int id) 1826 { 1827 ida_free(&pdom_ids, id); 1828 } 1829 1830 static void free_gcr3_tbl_level1(u64 *tbl) 1831 { 1832 u64 *ptr; 1833 int i; 1834 1835 for (i = 0; i < 512; ++i) { 1836 if (!(tbl[i] & GCR3_VALID)) 1837 continue; 1838 1839 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1840 1841 iommu_free_pages(ptr); 1842 } 1843 } 1844 1845 static void free_gcr3_tbl_level2(u64 *tbl) 1846 { 1847 u64 *ptr; 1848 int i; 1849 1850 for (i = 0; i < 512; ++i) { 1851 if (!(tbl[i] & GCR3_VALID)) 1852 continue; 1853 1854 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1855 1856 free_gcr3_tbl_level1(ptr); 1857 } 1858 } 1859 1860 static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) 1861 { 1862 if (gcr3_info->glx == 2) 1863 free_gcr3_tbl_level2(gcr3_info->gcr3_tbl); 1864 else if (gcr3_info->glx == 1) 1865 free_gcr3_tbl_level1(gcr3_info->gcr3_tbl); 1866 else 1867 WARN_ON_ONCE(gcr3_info->glx != 0); 1868 1869 gcr3_info->glx = 0; 1870 1871 /* Free per device domain ID */ 1872 pdom_id_free(gcr3_info->domid); 1873 1874 iommu_free_pages(gcr3_info->gcr3_tbl); 1875 gcr3_info->gcr3_tbl = NULL; 1876 } 1877 1878 /* 1879 * Number of GCR3 table levels required. Level must be 4-Kbyte 1880 * page and can contain up to 512 entries. 1881 */ 1882 static int get_gcr3_levels(int pasids) 1883 { 1884 int levels; 1885 1886 if (pasids == -1) 1887 return amd_iommu_max_glx_val; 1888 1889 levels = get_count_order(pasids); 1890 1891 return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; 1892 } 1893 1894 static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, 1895 struct amd_iommu *iommu, int pasids) 1896 { 1897 int levels = get_gcr3_levels(pasids); 1898 int nid = iommu ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 1899 int domid; 1900 1901 if (levels > amd_iommu_max_glx_val) 1902 return -EINVAL; 1903 1904 if (gcr3_info->gcr3_tbl) 1905 return -EBUSY; 1906 1907 /* Allocate per device domain ID */ 1908 domid = pdom_id_alloc(); 1909 if (domid <= 0) 1910 return -ENOSPC; 1911 gcr3_info->domid = domid; 1912 1913 gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K); 1914 if (gcr3_info->gcr3_tbl == NULL) { 1915 pdom_id_free(domid); 1916 return -ENOMEM; 1917 } 1918 1919 gcr3_info->glx = levels; 1920 1921 return 0; 1922 } 1923 1924 static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info, 1925 ioasid_t pasid, bool alloc) 1926 { 1927 int index; 1928 u64 *pte; 1929 u64 *root = gcr3_info->gcr3_tbl; 1930 int level = gcr3_info->glx; 1931 1932 while (true) { 1933 1934 index = (pasid >> (9 * level)) & 0x1ff; 1935 pte = &root[index]; 1936 1937 if (level == 0) 1938 break; 1939 1940 if (!(*pte & GCR3_VALID)) { 1941 if (!alloc) 1942 return NULL; 1943 1944 root = (void *)get_zeroed_page(GFP_ATOMIC); 1945 if (root == NULL) 1946 return NULL; 1947 1948 *pte = iommu_virt_to_phys(root) | GCR3_VALID; 1949 } 1950 1951 root = iommu_phys_to_virt(*pte & PAGE_MASK); 1952 1953 level -= 1; 1954 } 1955 1956 return pte; 1957 } 1958 1959 static int update_gcr3(struct iommu_dev_data *dev_data, 1960 ioasid_t pasid, unsigned long gcr3, bool set) 1961 { 1962 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 1963 u64 *pte; 1964 1965 pte = __get_gcr3_pte(gcr3_info, pasid, true); 1966 if (pte == NULL) 1967 return -ENOMEM; 1968 1969 if (set) 1970 *pte = (gcr3 & PAGE_MASK) | GCR3_VALID; 1971 else 1972 *pte = 0; 1973 1974 dev_flush_pasid_all(dev_data, pasid); 1975 return 0; 1976 } 1977 1978 int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid, 1979 unsigned long gcr3) 1980 { 1981 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 1982 int ret; 1983 1984 iommu_group_mutex_assert(dev_data->dev); 1985 1986 ret = update_gcr3(dev_data, pasid, gcr3, true); 1987 if (ret) 1988 return ret; 1989 1990 gcr3_info->pasid_cnt++; 1991 return ret; 1992 } 1993 1994 int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) 1995 { 1996 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 1997 int ret; 1998 1999 iommu_group_mutex_assert(dev_data->dev); 2000 2001 ret = update_gcr3(dev_data, pasid, 0, false); 2002 if (ret) 2003 return ret; 2004 2005 gcr3_info->pasid_cnt--; 2006 return ret; 2007 } 2008 2009 static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr, 2010 struct dev_table_entry *new) 2011 { 2012 /* All existing DTE must have V bit set */ 2013 new->data128[0] = DTE_FLAG_V; 2014 new->data128[1] = 0; 2015 } 2016 2017 /* 2018 * Note: 2019 * The old value for GCR3 table and GPT have been cleared from caller. 
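 *
 * The GCR3 table base programmed below is split across the DTE: bits 14:12
 * go into data[0] (together with GV and GLX), and bits 30:15 and 51:31 go
 * into data[1].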
2020 */ 2021 static void set_dte_gcr3_table(struct amd_iommu *iommu, 2022 struct iommu_dev_data *dev_data, 2023 struct dev_table_entry *target) 2024 { 2025 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2026 u64 gcr3; 2027 2028 if (!gcr3_info->gcr3_tbl) 2029 return; 2030 2031 pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n", 2032 __func__, dev_data->devid, gcr3_info->glx, 2033 (unsigned long long)gcr3_info->gcr3_tbl); 2034 2035 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); 2036 2037 target->data[0] |= DTE_FLAG_GV | 2038 FIELD_PREP(DTE_GLX, gcr3_info->glx) | 2039 FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12); 2040 if (pdom_is_v2_pgtbl_mode(dev_data->domain)) 2041 target->data[0] |= DTE_FLAG_GIOV; 2042 2043 target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | 2044 FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); 2045 2046 /* Guest page table can only support 4 and 5 levels */ 2047 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) 2048 target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); 2049 else 2050 target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); 2051 } 2052 2053 static void set_dte_entry(struct amd_iommu *iommu, 2054 struct iommu_dev_data *dev_data) 2055 { 2056 u16 domid; 2057 u32 old_domid; 2058 struct dev_table_entry *initial_dte; 2059 struct dev_table_entry new = {}; 2060 struct protection_domain *domain = dev_data->domain; 2061 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2062 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2063 2064 if (gcr3_info && gcr3_info->gcr3_tbl) 2065 domid = dev_data->gcr3_info.domid; 2066 else 2067 domid = domain->id; 2068 2069 make_clear_dte(dev_data, dte, &new); 2070 2071 if (domain->iop.mode != PAGE_MODE_NONE) 2072 new.data[0] |= iommu_virt_to_phys(domain->iop.root); 2073 2074 new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) 2075 << DEV_ENTRY_MODE_SHIFT; 2076 2077 new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; 2078 2079 /* 2080 * When SNP is enabled, we can only support TV=1 with non-zero domain ID. 2081 * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in 2082 * do_iommu_domain_alloc(). 2083 */ 2084 WARN_ON(amd_iommu_snp_en && (domid == 0)); 2085 new.data[0] |= DTE_FLAG_TV; 2086 2087 if (dev_data->ppr) 2088 new.data[0] |= 1ULL << DEV_ENTRY_PPR; 2089 2090 if (domain->dirty_tracking) 2091 new.data[0] |= DTE_FLAG_HAD; 2092 2093 if (dev_data->ats_enabled) 2094 new.data[1] |= DTE_FLAG_IOTLB; 2095 2096 old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK; 2097 new.data[1] |= domid; 2098 2099 /* 2100 * Restore cached persistent DTE bits, which can be set by information 2101 * in IVRS table. See set_dev_entry_from_acpi(). 
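 * Merging with a bitwise OR below keeps whatever bits the IVHD entries
 * requested without having to re-parse ACPI on every attach.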
2102 */ 2103 initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); 2104 if (initial_dte) { 2105 new.data128[0] |= initial_dte->data128[0]; 2106 new.data128[1] |= initial_dte->data128[1]; 2107 } 2108 2109 set_dte_gcr3_table(iommu, dev_data, &new); 2110 2111 update_dte256(iommu, dev_data, &new); 2112 2113 /* 2114 * A kdump kernel might be replacing a domain ID that was copied from 2115 * the previous kernel--if so, it needs to flush the translation cache 2116 * entries for the old domain ID that is being overwritten. 2117 */ 2118 if (old_domid) { 2119 amd_iommu_flush_tlb_domid(iommu, old_domid); 2120 } 2121 } 2122 2123 /* 2124 * Clear DMA-remap related flags to block all DMA (blocked domain) 2125 */ 2126 static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) 2127 { 2128 struct dev_table_entry new = {}; 2129 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2130 2131 make_clear_dte(dev_data, dte, &new); 2132 update_dte256(iommu, dev_data, &new); 2133 } 2134 2135 /* Update and flush DTE for the given device */ 2136 static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) 2137 { 2138 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 2139 2140 if (set) 2141 set_dte_entry(iommu, dev_data); 2142 else 2143 clear_dte_entry(iommu, dev_data); 2144 2145 clone_aliases(iommu, dev_data->dev); 2146 device_flush_dte(dev_data); 2147 iommu_completion_wait(iommu); 2148 } 2149 2150 /* 2151 * If domain is SVA capable then initialize GCR3 table. Also if domain is 2152 * in v2 page table mode then update GCR3[0]. 2153 */ 2154 static int init_gcr3_table(struct iommu_dev_data *dev_data, 2155 struct protection_domain *pdom) 2156 { 2157 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2158 int max_pasids = dev_data->max_pasids; 2159 int ret = 0; 2160 2161 /* 2162 * If domain is in pt mode then setup GCR3 table only if device 2163 * is PASID capable 2164 */ 2165 if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data)) 2166 return ret; 2167 2168 /* 2169 * By default, setup GCR3 table to support MAX PASIDs 2170 * supported by the device/IOMMU. 2171 */ 2172 ret = setup_gcr3_table(&dev_data->gcr3_info, iommu, 2173 max_pasids > 0 ?
max_pasids : 1); 2174 if (ret) 2175 return ret; 2176 2177 /* Setup GCR3[0] only if domain is setup with v2 page table mode */ 2178 if (!pdom_is_v2_pgtbl_mode(pdom)) 2179 return ret; 2180 2181 ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); 2182 if (ret) 2183 free_gcr3_table(&dev_data->gcr3_info); 2184 2185 return ret; 2186 } 2187 2188 static void destroy_gcr3_table(struct iommu_dev_data *dev_data, 2189 struct protection_domain *pdom) 2190 { 2191 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2192 2193 if (pdom_is_v2_pgtbl_mode(pdom)) 2194 update_gcr3(dev_data, 0, 0, false); 2195 2196 if (gcr3_info->gcr3_tbl == NULL) 2197 return; 2198 2199 free_gcr3_table(gcr3_info); 2200 } 2201 2202 static int pdom_attach_iommu(struct amd_iommu *iommu, 2203 struct protection_domain *pdom) 2204 { 2205 struct pdom_iommu_info *pdom_iommu_info, *curr; 2206 unsigned long flags; 2207 int ret = 0; 2208 2209 spin_lock_irqsave(&pdom->lock, flags); 2210 2211 pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); 2212 if (pdom_iommu_info) { 2213 pdom_iommu_info->refcnt++; 2214 goto out_unlock; 2215 } 2216 2217 pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); 2218 if (!pdom_iommu_info) { 2219 ret = -ENOMEM; 2220 goto out_unlock; 2221 } 2222 2223 pdom_iommu_info->iommu = iommu; 2224 pdom_iommu_info->refcnt = 1; 2225 2226 curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, 2227 NULL, pdom_iommu_info, GFP_ATOMIC); 2228 if (curr) { 2229 kfree(pdom_iommu_info); 2230 ret = -ENOSPC; 2231 goto out_unlock; 2232 } 2233 2234 out_unlock: 2235 spin_unlock_irqrestore(&pdom->lock, flags); 2236 return ret; 2237 } 2238 2239 static void pdom_detach_iommu(struct amd_iommu *iommu, 2240 struct protection_domain *pdom) 2241 { 2242 struct pdom_iommu_info *pdom_iommu_info; 2243 unsigned long flags; 2244 2245 spin_lock_irqsave(&pdom->lock, flags); 2246 2247 pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); 2248 if (!pdom_iommu_info) { 2249 spin_unlock_irqrestore(&pdom->lock, flags); 2250 return; 2251 } 2252 2253 pdom_iommu_info->refcnt--; 2254 if (pdom_iommu_info->refcnt == 0) { 2255 xa_erase(&pdom->iommu_array, iommu->index); 2256 kfree(pdom_iommu_info); 2257 } 2258 2259 spin_unlock_irqrestore(&pdom->lock, flags); 2260 } 2261 2262 /* 2263 * If a device is not yet associated with a domain, this function makes the 2264 * device visible in the domain 2265 */ 2266 static int attach_device(struct device *dev, 2267 struct protection_domain *domain) 2268 { 2269 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2270 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2271 struct pci_dev *pdev; 2272 unsigned long flags; 2273 int ret = 0; 2274 2275 mutex_lock(&dev_data->mutex); 2276 2277 if (dev_data->domain != NULL) { 2278 ret = -EBUSY; 2279 goto out; 2280 } 2281 2282 /* Do reference counting */ 2283 ret = pdom_attach_iommu(iommu, domain); 2284 if (ret) 2285 goto out; 2286 2287 /* Setup GCR3 table */ 2288 if (pdom_is_sva_capable(domain)) { 2289 ret = init_gcr3_table(dev_data, domain); 2290 if (ret) { 2291 pdom_detach_iommu(iommu, domain); 2292 goto out; 2293 } 2294 } 2295 2296 pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; 2297 if (pdev && pdom_is_sva_capable(domain)) { 2298 pdev_enable_caps(pdev); 2299 2300 /* 2301 * Device can continue to function even if IOPF 2302 * enablement failed. Hence in error path just 2303 * disable device PRI support. 
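 * The other capabilities enabled by pdev_enable_caps() above stay
 * enabled; only PRI is turned off again.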
2304 */ 2305 if (amd_iommu_iopf_add_device(iommu, dev_data)) 2306 pdev_disable_cap_pri(pdev); 2307 } else if (pdev) { 2308 pdev_enable_cap_ats(pdev); 2309 } 2310 2311 /* Update data structures */ 2312 dev_data->domain = domain; 2313 spin_lock_irqsave(&domain->lock, flags); 2314 list_add(&dev_data->list, &domain->dev_list); 2315 spin_unlock_irqrestore(&domain->lock, flags); 2316 2317 /* Update device table */ 2318 dev_update_dte(dev_data, true); 2319 2320 out: 2321 mutex_unlock(&dev_data->mutex); 2322 2323 return ret; 2324 } 2325 2326 /* 2327 * Removes a device from a protection domain (with devtable_lock held) 2328 */ 2329 static void detach_device(struct device *dev) 2330 { 2331 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2332 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2333 struct protection_domain *domain = dev_data->domain; 2334 unsigned long flags; 2335 2336 mutex_lock(&dev_data->mutex); 2337 2338 /* 2339 * First check if the device is still attached. It might already 2340 * be detached from its domain because the generic 2341 * iommu_detach_group code detached it and we try again here in 2342 * our alias handling. 2343 */ 2344 if (WARN_ON(!dev_data->domain)) 2345 goto out; 2346 2347 /* Remove IOPF handler */ 2348 if (dev_data->ppr) { 2349 iopf_queue_flush_dev(dev); 2350 amd_iommu_iopf_remove_device(iommu, dev_data); 2351 } 2352 2353 if (dev_is_pci(dev)) 2354 pdev_disable_caps(to_pci_dev(dev)); 2355 2356 /* Clear DTE and flush the entry */ 2357 dev_update_dte(dev_data, false); 2358 2359 /* Flush IOTLB and wait for the flushes to finish */ 2360 spin_lock_irqsave(&domain->lock, flags); 2361 amd_iommu_domain_flush_all(domain); 2362 list_del(&dev_data->list); 2363 spin_unlock_irqrestore(&domain->lock, flags); 2364 2365 /* Clear GCR3 table */ 2366 if (pdom_is_sva_capable(domain)) 2367 destroy_gcr3_table(dev_data, domain); 2368 2369 /* Update data structures */ 2370 dev_data->domain = NULL; 2371 2372 /* decrease reference counters - needs to happen after the flushes */ 2373 pdom_detach_iommu(iommu, domain); 2374 2375 out: 2376 mutex_unlock(&dev_data->mutex); 2377 } 2378 2379 static struct iommu_device *amd_iommu_probe_device(struct device *dev) 2380 { 2381 struct iommu_device *iommu_dev; 2382 struct amd_iommu *iommu; 2383 struct iommu_dev_data *dev_data; 2384 int ret; 2385 2386 if (!check_device(dev)) 2387 return ERR_PTR(-ENODEV); 2388 2389 iommu = rlookup_amd_iommu(dev); 2390 if (!iommu) 2391 return ERR_PTR(-ENODEV); 2392 2393 /* Not registered yet? */ 2394 if (!iommu->iommu.ops) 2395 return ERR_PTR(-ENODEV); 2396 2397 if (dev_iommu_priv_get(dev)) 2398 return &iommu->iommu; 2399 2400 ret = iommu_init_device(iommu, dev); 2401 if (ret) { 2402 dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); 2403 iommu_dev = ERR_PTR(ret); 2404 iommu_ignore_device(iommu, dev); 2405 goto out_err; 2406 } 2407 2408 amd_iommu_set_pci_msi_domain(dev, iommu); 2409 iommu_dev = &iommu->iommu; 2410 2411 /* 2412 * If IOMMU and device supports PASID then it will contain max 2413 * supported PASIDs, else it will be zero. 
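 * The stored value is the minimum of the IOMMU's and the PCI device's
 * advertised PASID counts, see the min_t() below.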
2414 */ 2415 dev_data = dev_iommu_priv_get(dev); 2416 if (amd_iommu_pasid_supported() && dev_is_pci(dev) && 2417 pdev_pasid_supported(dev_data)) { 2418 dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids, 2419 pci_max_pasids(to_pci_dev(dev))); 2420 } 2421 2422 if (amd_iommu_pgtable == PD_MODE_NONE) { 2423 pr_warn_once("%s: DMA translation not supported by iommu.\n", 2424 __func__); 2425 iommu_dev = ERR_PTR(-ENODEV); 2426 goto out_err; 2427 } 2428 2429 out_err: 2430 2431 iommu_completion_wait(iommu); 2432 2433 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 2434 dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; 2435 else 2436 dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; 2437 2438 if (dev_is_pci(dev)) 2439 pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); 2440 2441 return iommu_dev; 2442 } 2443 2444 static void amd_iommu_release_device(struct device *dev) 2445 { 2446 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2447 2448 WARN_ON(dev_data->domain); 2449 2450 /* 2451 * We keep dev_data around for unplugged devices and reuse it when the 2452 * device is re-plugged - not doing so would introduce a ton of races. 2453 */ 2454 } 2455 2456 static struct iommu_group *amd_iommu_device_group(struct device *dev) 2457 { 2458 if (dev_is_pci(dev)) 2459 return pci_device_group(dev); 2460 2461 return acpihid_device_group(dev); 2462 } 2463 2464 /***************************************************************************** 2465 * 2466 * The following functions belong to the exported interface of AMD IOMMU 2467 * 2468 * This interface allows access to lower level functions of the IOMMU 2469 * like protection domain handling and assignement of devices to domains 2470 * which is not possible with the dma_ops interface. 2471 * 2472 *****************************************************************************/ 2473 2474 static void protection_domain_init(struct protection_domain *domain) 2475 { 2476 spin_lock_init(&domain->lock); 2477 INIT_LIST_HEAD(&domain->dev_list); 2478 INIT_LIST_HEAD(&domain->dev_data_list); 2479 xa_init(&domain->iommu_array); 2480 } 2481 2482 struct protection_domain *protection_domain_alloc(void) 2483 { 2484 struct protection_domain *domain; 2485 int domid; 2486 2487 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2488 if (!domain) 2489 return NULL; 2490 2491 domid = pdom_id_alloc(); 2492 if (domid <= 0) { 2493 kfree(domain); 2494 return NULL; 2495 } 2496 domain->id = domid; 2497 2498 protection_domain_init(domain); 2499 2500 return domain; 2501 } 2502 2503 static int pdom_setup_pgtable(struct protection_domain *domain, 2504 struct device *dev) 2505 { 2506 struct io_pgtable_ops *pgtbl_ops; 2507 enum io_pgtable_fmt fmt; 2508 2509 switch (domain->pd_mode) { 2510 case PD_MODE_V1: 2511 fmt = AMD_IOMMU_V1; 2512 break; 2513 case PD_MODE_V2: 2514 fmt = AMD_IOMMU_V2; 2515 break; 2516 case PD_MODE_NONE: 2517 WARN_ON_ONCE(1); 2518 return -EPERM; 2519 } 2520 2521 domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); 2522 pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); 2523 if (!pgtbl_ops) 2524 return -ENOMEM; 2525 2526 return 0; 2527 } 2528 2529 static inline u64 dma_max_address(enum protection_domain_mode pgtable) 2530 { 2531 if (pgtable == PD_MODE_V1) 2532 return PM_LEVEL_SIZE(amd_iommu_hpt_level); 2533 2534 /* 2535 * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page 2536 * Translation" shows that the V2 table sign extends the top of the 2537 * address space creating a reserved region in the middle of the 2538 * translation, just like the CPU does. 
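 * For example, with a 4-level (48-bit) table the hole spans
 * 0x0000800000000000 - 0xffff7fffffffffff, matching the CPU's
 * non-canonical address hole.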
Further Vasant says the docs are 2539 * incomplete and this only applies to non-zero PASIDs. If the AMDv2 2540 * page table is assigned to the 0 PASID then there is no sign extension 2541 * check. 2542 * 2543 * Since the IOMMU must have a fixed geometry, and the core code does 2544 * not understand sign extended addressing, we have to chop off the high 2545 * bit to get consistent behavior with attachments of the domain to any 2546 * PASID. 2547 */ 2548 return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); 2549 } 2550 2551 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2552 { 2553 if (amd_iommu_hatdis) 2554 return false; 2555 2556 return iommu && (iommu->features & FEATURE_HDSUP); 2557 } 2558 2559 static struct iommu_domain * 2560 do_iommu_domain_alloc(struct device *dev, u32 flags, 2561 enum protection_domain_mode pgtable) 2562 { 2563 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 2564 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2565 struct protection_domain *domain; 2566 int ret; 2567 2568 domain = protection_domain_alloc(); 2569 if (!domain) 2570 return ERR_PTR(-ENOMEM); 2571 2572 domain->pd_mode = pgtable; 2573 ret = pdom_setup_pgtable(domain, dev); 2574 if (ret) { 2575 pdom_id_free(domain->id); 2576 kfree(domain); 2577 return ERR_PTR(ret); 2578 } 2579 2580 domain->domain.geometry.aperture_start = 0; 2581 domain->domain.geometry.aperture_end = dma_max_address(pgtable); 2582 domain->domain.geometry.force_aperture = true; 2583 domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; 2584 2585 domain->domain.type = IOMMU_DOMAIN_UNMANAGED; 2586 domain->domain.ops = iommu->iommu.ops->default_domain_ops; 2587 2588 if (dirty_tracking) 2589 domain->domain.dirty_ops = &amd_dirty_ops; 2590 2591 return &domain->domain; 2592 } 2593 2594 static struct iommu_domain * 2595 amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 2596 const struct iommu_user_data *user_data) 2597 2598 { 2599 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2600 const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 2601 IOMMU_HWPT_ALLOC_PASID; 2602 2603 if ((flags & ~supported_flags) || user_data) 2604 return ERR_PTR(-EOPNOTSUPP); 2605 2606 switch (flags & supported_flags) { 2607 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: 2608 /* Allocate domain with v1 page table for dirty tracking */ 2609 if (!amd_iommu_hd_support(iommu)) 2610 break; 2611 return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); 2612 case IOMMU_HWPT_ALLOC_PASID: 2613 /* Allocate domain with v2 page table if IOMMU supports PASID. 
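		 * Without PASID support we break out of the switch and
		 * return -EOPNOTSUPP below.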
*/ 2614 if (!amd_iommu_pasid_supported()) 2615 break; 2616 return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); 2617 case 0: 2618 /* If nothing specific is required use the kernel commandline default */ 2619 return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); 2620 default: 2621 break; 2622 } 2623 return ERR_PTR(-EOPNOTSUPP); 2624 } 2625 2626 void amd_iommu_domain_free(struct iommu_domain *dom) 2627 { 2628 struct protection_domain *domain = to_pdomain(dom); 2629 2630 WARN_ON(!list_empty(&domain->dev_list)); 2631 if (domain->domain.type & __IOMMU_DOMAIN_PAGING) 2632 free_io_pgtable_ops(&domain->iop.pgtbl.ops); 2633 pdom_id_free(domain->id); 2634 kfree(domain); 2635 } 2636 2637 static int blocked_domain_attach_device(struct iommu_domain *domain, 2638 struct device *dev) 2639 { 2640 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2641 2642 if (dev_data->domain) 2643 detach_device(dev); 2644 2645 /* Clear DTE and flush the entry */ 2646 mutex_lock(&dev_data->mutex); 2647 dev_update_dte(dev_data, false); 2648 mutex_unlock(&dev_data->mutex); 2649 2650 return 0; 2651 } 2652 2653 static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, 2654 struct device *dev, ioasid_t pasid, 2655 struct iommu_domain *old) 2656 { 2657 amd_iommu_remove_dev_pasid(dev, pasid, old); 2658 return 0; 2659 } 2660 2661 static struct iommu_domain blocked_domain = { 2662 .type = IOMMU_DOMAIN_BLOCKED, 2663 .ops = &(const struct iommu_domain_ops) { 2664 .attach_dev = blocked_domain_attach_device, 2665 .set_dev_pasid = blocked_domain_set_dev_pasid, 2666 } 2667 }; 2668 2669 static struct protection_domain identity_domain; 2670 2671 static const struct iommu_domain_ops identity_domain_ops = { 2672 .attach_dev = amd_iommu_attach_device, 2673 }; 2674 2675 void amd_iommu_init_identity_domain(void) 2676 { 2677 struct iommu_domain *domain = &identity_domain.domain; 2678 2679 domain->type = IOMMU_DOMAIN_IDENTITY; 2680 domain->ops = &identity_domain_ops; 2681 domain->owner = &amd_iommu_ops; 2682 2683 identity_domain.id = pdom_id_alloc(); 2684 2685 protection_domain_init(&identity_domain); 2686 } 2687 2688 /* Same as blocked domain except it supports only ops->attach_dev() */ 2689 static struct iommu_domain release_domain = { 2690 .type = IOMMU_DOMAIN_BLOCKED, 2691 .ops = &(const struct iommu_domain_ops) { 2692 .attach_dev = blocked_domain_attach_device, 2693 } 2694 }; 2695 2696 static int amd_iommu_attach_device(struct iommu_domain *dom, 2697 struct device *dev) 2698 { 2699 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2700 struct protection_domain *domain = to_pdomain(dom); 2701 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2702 int ret; 2703 2704 /* 2705 * Skip attach device to domain if new domain is same as 2706 * devices current domain 2707 */ 2708 if (dev_data->domain == domain) 2709 return 0; 2710 2711 dev_data->defer_attach = false; 2712 2713 /* 2714 * Restrict to devices with compatible IOMMU hardware support 2715 * when enforcement of dirty tracking is enabled. 
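	 * dom->dirty_ops is only set when the domain was allocated with
	 * IOMMU_HWPT_ALLOC_DIRTY_TRACKING, see do_iommu_domain_alloc().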
2716 */ 2717 if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) 2718 return -EINVAL; 2719 2720 if (dev_data->domain) 2721 detach_device(dev); 2722 2723 ret = attach_device(dev, domain); 2724 2725 #ifdef CONFIG_IRQ_REMAP 2726 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { 2727 if (dom->type == IOMMU_DOMAIN_UNMANAGED) 2728 dev_data->use_vapic = 1; 2729 else 2730 dev_data->use_vapic = 0; 2731 } 2732 #endif 2733 2734 return ret; 2735 } 2736 2737 static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2738 unsigned long iova, size_t size) 2739 { 2740 struct protection_domain *domain = to_pdomain(dom); 2741 struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2742 2743 if (ops->map_pages) 2744 domain_flush_np_cache(domain, iova, size); 2745 return 0; 2746 } 2747 2748 static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, 2749 phys_addr_t paddr, size_t pgsize, size_t pgcount, 2750 int iommu_prot, gfp_t gfp, size_t *mapped) 2751 { 2752 struct protection_domain *domain = to_pdomain(dom); 2753 struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2754 int prot = 0; 2755 int ret = -EINVAL; 2756 2757 if ((domain->pd_mode == PD_MODE_V1) && 2758 (domain->iop.mode == PAGE_MODE_NONE)) 2759 return -EINVAL; 2760 2761 if (iommu_prot & IOMMU_READ) 2762 prot |= IOMMU_PROT_IR; 2763 if (iommu_prot & IOMMU_WRITE) 2764 prot |= IOMMU_PROT_IW; 2765 2766 if (ops->map_pages) { 2767 ret = ops->map_pages(ops, iova, paddr, pgsize, 2768 pgcount, prot, gfp, mapped); 2769 } 2770 2771 return ret; 2772 } 2773 2774 static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, 2775 struct iommu_iotlb_gather *gather, 2776 unsigned long iova, size_t size) 2777 { 2778 /* 2779 * AMD's IOMMU can flush as many pages as necessary in a single flush. 2780 * Unless we run in a virtual machine, which can be inferred according 2781 * to whether "non-present cache" is on, it is probably best to prefer 2782 * (potentially) too extensive TLB flushing (i.e., more misses) over 2783 * mutliple TLB flushes (i.e., more flushes). For virtual machines the 2784 * hypervisor needs to synchronize the host IOMMU PTEs with those of 2785 * the guest, and the trade-off is different: unnecessary TLB flushes 2786 * should be avoided. 2787 */ 2788 if (amd_iommu_np_cache && 2789 iommu_iotlb_gather_is_disjoint(gather, iova, size)) 2790 iommu_iotlb_sync(domain, gather); 2791 2792 iommu_iotlb_gather_add_range(gather, iova, size); 2793 } 2794 2795 static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, 2796 size_t pgsize, size_t pgcount, 2797 struct iommu_iotlb_gather *gather) 2798 { 2799 struct protection_domain *domain = to_pdomain(dom); 2800 struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2801 size_t r; 2802 2803 if ((domain->pd_mode == PD_MODE_V1) && 2804 (domain->iop.mode == PAGE_MODE_NONE)) 2805 return 0; 2806 2807 r = (ops->unmap_pages) ? 
ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; 2808 2809 if (r) 2810 amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); 2811 2812 return r; 2813 } 2814 2815 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2816 dma_addr_t iova) 2817 { 2818 struct protection_domain *domain = to_pdomain(dom); 2819 struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2820 2821 return ops->iova_to_phys(ops, iova); 2822 } 2823 2824 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) 2825 { 2826 switch (cap) { 2827 case IOMMU_CAP_CACHE_COHERENCY: 2828 return true; 2829 case IOMMU_CAP_NOEXEC: 2830 return false; 2831 case IOMMU_CAP_PRE_BOOT_PROTECTION: 2832 return amdr_ivrs_remap_support; 2833 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 2834 return true; 2835 case IOMMU_CAP_DEFERRED_FLUSH: 2836 return true; 2837 case IOMMU_CAP_DIRTY_TRACKING: { 2838 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2839 2840 return amd_iommu_hd_support(iommu); 2841 } 2842 default: 2843 break; 2844 } 2845 2846 return false; 2847 } 2848 2849 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 2850 bool enable) 2851 { 2852 struct protection_domain *pdomain = to_pdomain(domain); 2853 struct dev_table_entry *dte; 2854 struct iommu_dev_data *dev_data; 2855 bool domain_flush = false; 2856 struct amd_iommu *iommu; 2857 unsigned long flags; 2858 u64 new; 2859 2860 spin_lock_irqsave(&pdomain->lock, flags); 2861 if (!(pdomain->dirty_tracking ^ enable)) { 2862 spin_unlock_irqrestore(&pdomain->lock, flags); 2863 return 0; 2864 } 2865 2866 list_for_each_entry(dev_data, &pdomain->dev_list, list) { 2867 spin_lock(&dev_data->dte_lock); 2868 iommu = get_amd_iommu_from_dev_data(dev_data); 2869 dte = &get_dev_table(iommu)[dev_data->devid]; 2870 new = dte->data[0]; 2871 new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); 2872 dte->data[0] = new; 2873 spin_unlock(&dev_data->dte_lock); 2874 2875 /* Flush device DTE */ 2876 device_flush_dte(dev_data); 2877 domain_flush = true; 2878 } 2879 2880 /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ 2881 if (domain_flush) 2882 amd_iommu_domain_flush_all(pdomain); 2883 2884 pdomain->dirty_tracking = enable; 2885 spin_unlock_irqrestore(&pdomain->lock, flags); 2886 2887 return 0; 2888 } 2889 2890 static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, 2891 unsigned long iova, size_t size, 2892 unsigned long flags, 2893 struct iommu_dirty_bitmap *dirty) 2894 { 2895 struct protection_domain *pdomain = to_pdomain(domain); 2896 struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; 2897 unsigned long lflags; 2898 2899 if (!ops || !ops->read_and_clear_dirty) 2900 return -EOPNOTSUPP; 2901 2902 spin_lock_irqsave(&pdomain->lock, lflags); 2903 if (!pdomain->dirty_tracking && dirty->bitmap) { 2904 spin_unlock_irqrestore(&pdomain->lock, lflags); 2905 return -EINVAL; 2906 } 2907 spin_unlock_irqrestore(&pdomain->lock, lflags); 2908 2909 return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); 2910 } 2911 2912 static void amd_iommu_get_resv_regions(struct device *dev, 2913 struct list_head *head) 2914 { 2915 struct iommu_resv_region *region; 2916 struct unity_map_entry *entry; 2917 struct amd_iommu *iommu; 2918 struct amd_iommu_pci_seg *pci_seg; 2919 int devid, sbdf; 2920 2921 sbdf = get_device_sbdf_id(dev); 2922 if (sbdf < 0) 2923 return; 2924 2925 devid = PCI_SBDF_TO_DEVID(sbdf); 2926 iommu = get_amd_iommu_from_dev(dev); 2927 pci_seg = iommu->pci_seg; 2928 2929 list_for_each_entry(entry, &pci_seg->unity_map, list) { 2930 int type, prot = 0; 2931 size_t length; 2932 2933 if (devid < entry->devid_start || devid > entry->devid_end) 2934 continue; 2935 2936 type = IOMMU_RESV_DIRECT; 2937 length = entry->address_end - entry->address_start; 2938 if (entry->prot & IOMMU_PROT_IR) 2939 prot |= IOMMU_READ; 2940 if (entry->prot & IOMMU_PROT_IW) 2941 prot |= IOMMU_WRITE; 2942 if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE) 2943 /* Exclusion range */ 2944 type = IOMMU_RESV_RESERVED; 2945 2946 region = iommu_alloc_resv_region(entry->address_start, 2947 length, prot, type, 2948 GFP_KERNEL); 2949 if (!region) { 2950 dev_err(dev, "Out of memory allocating dm-regions\n"); 2951 return; 2952 } 2953 list_add_tail(®ion->list, head); 2954 } 2955 2956 region = iommu_alloc_resv_region(MSI_RANGE_START, 2957 MSI_RANGE_END - MSI_RANGE_START + 1, 2958 0, IOMMU_RESV_MSI, GFP_KERNEL); 2959 if (!region) 2960 return; 2961 list_add_tail(®ion->list, head); 2962 2963 if (amd_iommu_ht_range_ignore()) 2964 return; 2965 2966 region = iommu_alloc_resv_region(HT_RANGE_START, 2967 HT_RANGE_END - HT_RANGE_START + 1, 2968 0, IOMMU_RESV_RESERVED, GFP_KERNEL); 2969 if (!region) 2970 return; 2971 list_add_tail(®ion->list, head); 2972 } 2973 2974 static bool amd_iommu_is_attach_deferred(struct device *dev) 2975 { 2976 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2977 2978 return dev_data->defer_attach; 2979 } 2980 2981 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2982 { 2983 struct protection_domain *dom = to_pdomain(domain); 2984 unsigned long flags; 2985 2986 spin_lock_irqsave(&dom->lock, flags); 2987 amd_iommu_domain_flush_all(dom); 2988 spin_unlock_irqrestore(&dom->lock, flags); 2989 } 2990 2991 static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2992 struct iommu_iotlb_gather *gather) 2993 { 2994 
struct protection_domain *dom = to_pdomain(domain); 2995 unsigned long flags; 2996 2997 spin_lock_irqsave(&dom->lock, flags); 2998 amd_iommu_domain_flush_pages(dom, gather->start, 2999 gather->end - gather->start + 1); 3000 spin_unlock_irqrestore(&dom->lock, flags); 3001 } 3002 3003 static int amd_iommu_def_domain_type(struct device *dev) 3004 { 3005 struct iommu_dev_data *dev_data; 3006 3007 dev_data = dev_iommu_priv_get(dev); 3008 if (!dev_data) 3009 return 0; 3010 3011 /* Always use DMA domain for untrusted device */ 3012 if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) 3013 return IOMMU_DOMAIN_DMA; 3014 3015 /* 3016 * Do not identity map IOMMUv2 capable devices when: 3017 * - memory encryption is active, because some of those devices 3018 * (AMD GPUs) don't have the encryption bit in their DMA-mask 3019 * and require remapping. 3020 * - SNP is enabled, because it prohibits DTE[Mode]=0. 3021 */ 3022 if (pdev_pasid_supported(dev_data) && 3023 !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && 3024 !amd_iommu_snp_en) { 3025 return IOMMU_DOMAIN_IDENTITY; 3026 } 3027 3028 return 0; 3029 } 3030 3031 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3032 { 3033 /* IOMMU_PTE_FC is always set */ 3034 return true; 3035 } 3036 3037 static const struct iommu_dirty_ops amd_dirty_ops = { 3038 .set_dirty_tracking = amd_iommu_set_dirty_tracking, 3039 .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, 3040 }; 3041 3042 const struct iommu_ops amd_iommu_ops = { 3043 .capable = amd_iommu_capable, 3044 .blocked_domain = &blocked_domain, 3045 .release_domain = &release_domain, 3046 .identity_domain = &identity_domain.domain, 3047 .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, 3048 .domain_alloc_sva = amd_iommu_domain_alloc_sva, 3049 .probe_device = amd_iommu_probe_device, 3050 .release_device = amd_iommu_release_device, 3051 .device_group = amd_iommu_device_group, 3052 .get_resv_regions = amd_iommu_get_resv_regions, 3053 .is_attach_deferred = amd_iommu_is_attach_deferred, 3054 .def_domain_type = amd_iommu_def_domain_type, 3055 .page_response = amd_iommu_page_response, 3056 .default_domain_ops = &(const struct iommu_domain_ops) { 3057 .attach_dev = amd_iommu_attach_device, 3058 .map_pages = amd_iommu_map_pages, 3059 .unmap_pages = amd_iommu_unmap_pages, 3060 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 3061 .iova_to_phys = amd_iommu_iova_to_phys, 3062 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 3063 .iotlb_sync = amd_iommu_iotlb_sync, 3064 .free = amd_iommu_domain_free, 3065 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 3066 } 3067 }; 3068 3069 #ifdef CONFIG_IRQ_REMAP 3070 3071 /***************************************************************************** 3072 * 3073 * Interrupt Remapping Implementation 3074 * 3075 *****************************************************************************/ 3076 3077 static struct irq_chip amd_ir_chip; 3078 static DEFINE_SPINLOCK(iommu_table_lock); 3079 3080 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) 3081 { 3082 int ret; 3083 u64 data; 3084 unsigned long flags; 3085 struct iommu_cmd cmd, cmd2; 3086 3087 if (iommu->irtcachedis_enabled) 3088 return; 3089 3090 build_inv_irt(&cmd, devid); 3091 data = atomic64_inc_return(&iommu->cmd_sem_val); 3092 build_completion_wait(&cmd2, iommu, data); 3093 3094 raw_spin_lock_irqsave(&iommu->lock, flags); 3095 ret = __iommu_queue_command_sync(iommu, &cmd, true); 3096 if (ret) 3097 goto out; 3098 ret = __iommu_queue_command_sync(iommu, &cmd2, false); 
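	/*
	 * Commands are executed in order, so waiting on the COMPLETION_WAIT
	 * semaphore below also waits for the IRT invalidation above.
	 */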
3099 if (ret) 3100 goto out; 3101 wait_on_sem(iommu, data); 3102 out: 3103 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3104 } 3105 3106 static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) 3107 { 3108 if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) 3109 return DTE_INTTABLEN_2K; 3110 return DTE_INTTABLEN_512; 3111 } 3112 3113 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, 3114 struct irq_remap_table *table) 3115 { 3116 u64 new; 3117 struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; 3118 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3119 3120 if (dev_data) 3121 spin_lock(&dev_data->dte_lock); 3122 3123 new = READ_ONCE(dte->data[2]); 3124 new &= ~DTE_IRQ_PHYS_ADDR_MASK; 3125 new |= iommu_virt_to_phys(table->table); 3126 new |= DTE_IRQ_REMAP_INTCTL; 3127 new |= iommu_get_int_tablen(dev_data); 3128 new |= DTE_IRQ_REMAP_ENABLE; 3129 WRITE_ONCE(dte->data[2], new); 3130 3131 if (dev_data) 3132 spin_unlock(&dev_data->dte_lock); 3133 } 3134 3135 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) 3136 { 3137 struct irq_remap_table *table; 3138 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3139 3140 if (WARN_ONCE(!pci_seg->rlookup_table[devid], 3141 "%s: no iommu for devid %x:%x\n", 3142 __func__, pci_seg->id, devid)) 3143 return NULL; 3144 3145 table = pci_seg->irq_lookup_table[devid]; 3146 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n", 3147 __func__, pci_seg->id, devid)) 3148 return NULL; 3149 3150 return table; 3151 } 3152 3153 static struct irq_remap_table *__alloc_irq_table(int nid, size_t size) 3154 { 3155 struct irq_remap_table *table; 3156 3157 table = kzalloc(sizeof(*table), GFP_KERNEL); 3158 if (!table) 3159 return NULL; 3160 3161 table->table = iommu_alloc_pages_node_sz( 3162 nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size)); 3163 if (!table->table) { 3164 kfree(table); 3165 return NULL; 3166 } 3167 raw_spin_lock_init(&table->lock); 3168 3169 return table; 3170 } 3171 3172 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, 3173 struct irq_remap_table *table) 3174 { 3175 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3176 3177 pci_seg->irq_lookup_table[devid] = table; 3178 set_dte_irq_entry(iommu, devid, table); 3179 iommu_flush_dte(iommu, devid); 3180 } 3181 3182 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, 3183 void *data) 3184 { 3185 struct irq_remap_table *table = data; 3186 struct amd_iommu_pci_seg *pci_seg; 3187 struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev); 3188 3189 if (!iommu) 3190 return -EINVAL; 3191 3192 pci_seg = iommu->pci_seg; 3193 pci_seg->irq_lookup_table[alias] = table; 3194 set_dte_irq_entry(iommu, alias, table); 3195 iommu_flush_dte(pci_seg->rlookup_table[alias], alias); 3196 3197 return 0; 3198 } 3199 3200 static inline size_t get_irq_table_size(unsigned int max_irqs) 3201 { 3202 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3203 return max_irqs * sizeof(u32); 3204 3205 return max_irqs * (sizeof(u64) * 2); 3206 } 3207 3208 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, 3209 u16 devid, struct pci_dev *pdev, 3210 unsigned int max_irqs) 3211 { 3212 struct irq_remap_table *table = NULL; 3213 struct irq_remap_table *new_table = NULL; 3214 struct amd_iommu_pci_seg *pci_seg; 3215 unsigned long flags; 3216 int nid = iommu && iommu->dev ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3217 u16 alias; 3218 3219 spin_lock_irqsave(&iommu_table_lock, flags); 3220 3221 pci_seg = iommu->pci_seg; 3222 table = pci_seg->irq_lookup_table[devid]; 3223 if (table) 3224 goto out_unlock; 3225 3226 alias = pci_seg->alias_table[devid]; 3227 table = pci_seg->irq_lookup_table[alias]; 3228 if (table) { 3229 set_remap_table_entry(iommu, devid, table); 3230 goto out_wait; 3231 } 3232 spin_unlock_irqrestore(&iommu_table_lock, flags); 3233 3234 /* Nothing there yet, allocate new irq remapping table */ 3235 new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs)); 3236 if (!new_table) 3237 return NULL; 3238 3239 spin_lock_irqsave(&iommu_table_lock, flags); 3240 3241 table = pci_seg->irq_lookup_table[devid]; 3242 if (table) 3243 goto out_unlock; 3244 3245 table = pci_seg->irq_lookup_table[alias]; 3246 if (table) { 3247 set_remap_table_entry(iommu, devid, table); 3248 goto out_wait; 3249 } 3250 3251 table = new_table; 3252 new_table = NULL; 3253 3254 if (pdev) 3255 pci_for_each_dma_alias(pdev, set_remap_table_entry_alias, 3256 table); 3257 else 3258 set_remap_table_entry(iommu, devid, table); 3259 3260 if (devid != alias) 3261 set_remap_table_entry(iommu, alias, table); 3262 3263 out_wait: 3264 iommu_completion_wait(iommu); 3265 3266 out_unlock: 3267 spin_unlock_irqrestore(&iommu_table_lock, flags); 3268 3269 if (new_table) { 3270 iommu_free_pages(new_table->table); 3271 kfree(new_table); 3272 } 3273 return table; 3274 } 3275 3276 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, 3277 bool align, struct pci_dev *pdev, 3278 unsigned long max_irqs) 3279 { 3280 struct irq_remap_table *table; 3281 int index, c, alignment = 1; 3282 unsigned long flags; 3283 3284 table = alloc_irq_table(iommu, devid, pdev, max_irqs); 3285 if (!table) 3286 return -ENODEV; 3287 3288 if (align) 3289 alignment = roundup_pow_of_two(count); 3290 3291 raw_spin_lock_irqsave(&table->lock, flags); 3292 3293 /* Scan table for free entries */ 3294 for (index = ALIGN(table->min_index, alignment), c = 0; 3295 index < max_irqs;) { 3296 if (!iommu->irte_ops->is_allocated(table, index)) { 3297 c += 1; 3298 } else { 3299 c = 0; 3300 index = ALIGN(index + 1, alignment); 3301 continue; 3302 } 3303 3304 if (c == count) { 3305 for (; c != 0; --c) 3306 iommu->irte_ops->set_allocated(table, index - c + 1); 3307 3308 index -= count - 1; 3309 goto out; 3310 } 3311 3312 index++; 3313 } 3314 3315 index = -ENOSPC; 3316 3317 out: 3318 raw_spin_unlock_irqrestore(&table->lock, flags); 3319 3320 return index; 3321 } 3322 3323 static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3324 struct irte_ga *irte) 3325 { 3326 struct irq_remap_table *table; 3327 struct irte_ga *entry; 3328 unsigned long flags; 3329 u128 old; 3330 3331 table = get_irq_table(iommu, devid); 3332 if (!table) 3333 return -ENOMEM; 3334 3335 raw_spin_lock_irqsave(&table->lock, flags); 3336 3337 entry = (struct irte_ga *)table->table; 3338 entry = &entry[index]; 3339 3340 /* 3341 * We use cmpxchg16 to atomically update the 128-bit IRTE, 3342 * and it cannot be updated by the hardware or other processors 3343 * behind us, so the return value of cmpxchg16 should be the 3344 * same as the old value. 
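	 * The WARN_ON() below catches the case where that assumption is
	 * ever violated.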
3345 */ 3346 old = entry->irte; 3347 WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte)); 3348 3349 raw_spin_unlock_irqrestore(&table->lock, flags); 3350 3351 return 0; 3352 } 3353 3354 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3355 struct irte_ga *irte) 3356 { 3357 bool ret; 3358 3359 ret = __modify_irte_ga(iommu, devid, index, irte); 3360 if (ret) 3361 return ret; 3362 3363 iommu_flush_irt_and_complete(iommu, devid); 3364 3365 return 0; 3366 } 3367 3368 static int modify_irte(struct amd_iommu *iommu, 3369 u16 devid, int index, union irte *irte) 3370 { 3371 struct irq_remap_table *table; 3372 unsigned long flags; 3373 3374 table = get_irq_table(iommu, devid); 3375 if (!table) 3376 return -ENOMEM; 3377 3378 raw_spin_lock_irqsave(&table->lock, flags); 3379 table->table[index] = irte->val; 3380 raw_spin_unlock_irqrestore(&table->lock, flags); 3381 3382 iommu_flush_irt_and_complete(iommu, devid); 3383 3384 return 0; 3385 } 3386 3387 static void free_irte(struct amd_iommu *iommu, u16 devid, int index) 3388 { 3389 struct irq_remap_table *table; 3390 unsigned long flags; 3391 3392 table = get_irq_table(iommu, devid); 3393 if (!table) 3394 return; 3395 3396 raw_spin_lock_irqsave(&table->lock, flags); 3397 iommu->irte_ops->clear_allocated(table, index); 3398 raw_spin_unlock_irqrestore(&table->lock, flags); 3399 3400 iommu_flush_irt_and_complete(iommu, devid); 3401 } 3402 3403 static void irte_prepare(void *entry, 3404 u32 delivery_mode, bool dest_mode, 3405 u8 vector, u32 dest_apicid, int devid) 3406 { 3407 union irte *irte = (union irte *) entry; 3408 3409 irte->val = 0; 3410 irte->fields.vector = vector; 3411 irte->fields.int_type = delivery_mode; 3412 irte->fields.destination = dest_apicid; 3413 irte->fields.dm = dest_mode; 3414 irte->fields.valid = 1; 3415 } 3416 3417 static void irte_ga_prepare(void *entry, 3418 u32 delivery_mode, bool dest_mode, 3419 u8 vector, u32 dest_apicid, int devid) 3420 { 3421 struct irte_ga *irte = (struct irte_ga *) entry; 3422 3423 irte->lo.val = 0; 3424 irte->hi.val = 0; 3425 irte->lo.fields_remap.int_type = delivery_mode; 3426 irte->lo.fields_remap.dm = dest_mode; 3427 irte->hi.fields.vector = vector; 3428 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid); 3429 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid); 3430 irte->lo.fields_remap.valid = 1; 3431 } 3432 3433 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3434 { 3435 union irte *irte = (union irte *) entry; 3436 3437 irte->fields.valid = 1; 3438 modify_irte(iommu, devid, index, irte); 3439 } 3440 3441 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3442 { 3443 struct irte_ga *irte = (struct irte_ga *) entry; 3444 3445 irte->lo.fields_remap.valid = 1; 3446 modify_irte_ga(iommu, devid, index, irte); 3447 } 3448 3449 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3450 { 3451 union irte *irte = (union irte *) entry; 3452 3453 irte->fields.valid = 0; 3454 modify_irte(iommu, devid, index, irte); 3455 } 3456 3457 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3458 { 3459 struct irte_ga *irte = (struct irte_ga *) entry; 3460 3461 irte->lo.fields_remap.valid = 0; 3462 modify_irte_ga(iommu, devid, index, irte); 3463 } 3464 3465 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3466 u8 vector, u32 dest_apicid) 3467 { 3468 union irte *irte = 
(union irte *) entry; 3469 3470 irte->fields.vector = vector; 3471 irte->fields.destination = dest_apicid; 3472 modify_irte(iommu, devid, index, irte); 3473 } 3474 3475 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3476 u8 vector, u32 dest_apicid) 3477 { 3478 struct irte_ga *irte = (struct irte_ga *) entry; 3479 3480 if (!irte->lo.fields_remap.guest_mode) { 3481 irte->hi.fields.vector = vector; 3482 irte->lo.fields_remap.destination = 3483 APICID_TO_IRTE_DEST_LO(dest_apicid); 3484 irte->hi.fields.destination = 3485 APICID_TO_IRTE_DEST_HI(dest_apicid); 3486 modify_irte_ga(iommu, devid, index, irte); 3487 } 3488 } 3489 3490 #define IRTE_ALLOCATED (~1U) 3491 static void irte_set_allocated(struct irq_remap_table *table, int index) 3492 { 3493 table->table[index] = IRTE_ALLOCATED; 3494 } 3495 3496 static void irte_ga_set_allocated(struct irq_remap_table *table, int index) 3497 { 3498 struct irte_ga *ptr = (struct irte_ga *)table->table; 3499 struct irte_ga *irte = &ptr[index]; 3500 3501 memset(&irte->lo.val, 0, sizeof(u64)); 3502 memset(&irte->hi.val, 0, sizeof(u64)); 3503 irte->hi.fields.vector = 0xff; 3504 } 3505 3506 static bool irte_is_allocated(struct irq_remap_table *table, int index) 3507 { 3508 union irte *ptr = (union irte *)table->table; 3509 union irte *irte = &ptr[index]; 3510 3511 return irte->val != 0; 3512 } 3513 3514 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) 3515 { 3516 struct irte_ga *ptr = (struct irte_ga *)table->table; 3517 struct irte_ga *irte = &ptr[index]; 3518 3519 return irte->hi.fields.vector != 0; 3520 } 3521 3522 static void irte_clear_allocated(struct irq_remap_table *table, int index) 3523 { 3524 table->table[index] = 0; 3525 } 3526 3527 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) 3528 { 3529 struct irte_ga *ptr = (struct irte_ga *)table->table; 3530 struct irte_ga *irte = &ptr[index]; 3531 3532 memset(&irte->lo.val, 0, sizeof(u64)); 3533 memset(&irte->hi.val, 0, sizeof(u64)); 3534 } 3535 3536 static int get_devid(struct irq_alloc_info *info) 3537 { 3538 switch (info->type) { 3539 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3540 return get_ioapic_devid(info->devid); 3541 case X86_IRQ_ALLOC_TYPE_HPET: 3542 return get_hpet_devid(info->devid); 3543 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3544 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3545 return get_device_sbdf_id(msi_desc_to_dev(info->desc)); 3546 default: 3547 WARN_ON_ONCE(1); 3548 return -1; 3549 } 3550 } 3551 3552 struct irq_remap_ops amd_iommu_irq_ops = { 3553 .prepare = amd_iommu_prepare, 3554 .enable = amd_iommu_enable, 3555 .disable = amd_iommu_disable, 3556 .reenable = amd_iommu_reenable, 3557 .enable_faulting = amd_iommu_enable_faulting, 3558 }; 3559 3560 static void fill_msi_msg(struct msi_msg *msg, u32 index) 3561 { 3562 msg->data = index; 3563 msg->address_lo = 0; 3564 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; 3565 /* 3566 * The struct msi_msg.dest_mode_logical is used to set the DM bit 3567 * in MSI Message Address Register. For device w/ 2K int-remap support, 3568 * this is bit must be set to 1 regardless of the actual destination 3569 * mode, which is signified by the IRTE[DM]. 
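	 * In other words, once the interrupt is remapped the effective
	 * destination mode comes from the IRTE, not from this address bit.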
3570 */ 3571 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 3572 msg->arch_addr_lo.dest_mode_logical = true; 3573 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; 3574 } 3575 3576 static void irq_remapping_prepare_irte(struct amd_ir_data *data, 3577 struct irq_cfg *irq_cfg, 3578 struct irq_alloc_info *info, 3579 int devid, int index, int sub_handle) 3580 { 3581 struct irq_2_irte *irte_info = &data->irq_2_irte; 3582 struct amd_iommu *iommu = data->iommu; 3583 3584 if (!iommu) 3585 return; 3586 3587 data->irq_2_irte.devid = devid; 3588 data->irq_2_irte.index = index + sub_handle; 3589 iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED, 3590 apic->dest_mode_logical, irq_cfg->vector, 3591 irq_cfg->dest_apicid, devid); 3592 3593 switch (info->type) { 3594 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3595 case X86_IRQ_ALLOC_TYPE_HPET: 3596 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3597 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3598 fill_msi_msg(&data->msi_entry, irte_info->index); 3599 break; 3600 3601 default: 3602 BUG_ON(1); 3603 break; 3604 } 3605 } 3606 3607 struct amd_irte_ops irte_32_ops = { 3608 .prepare = irte_prepare, 3609 .activate = irte_activate, 3610 .deactivate = irte_deactivate, 3611 .set_affinity = irte_set_affinity, 3612 .set_allocated = irte_set_allocated, 3613 .is_allocated = irte_is_allocated, 3614 .clear_allocated = irte_clear_allocated, 3615 }; 3616 3617 struct amd_irte_ops irte_128_ops = { 3618 .prepare = irte_ga_prepare, 3619 .activate = irte_ga_activate, 3620 .deactivate = irte_ga_deactivate, 3621 .set_affinity = irte_ga_set_affinity, 3622 .set_allocated = irte_ga_set_allocated, 3623 .is_allocated = irte_ga_is_allocated, 3624 .clear_allocated = irte_ga_clear_allocated, 3625 }; 3626 3627 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, 3628 unsigned int nr_irqs, void *arg) 3629 { 3630 struct irq_alloc_info *info = arg; 3631 struct irq_data *irq_data; 3632 struct amd_ir_data *data = NULL; 3633 struct amd_iommu *iommu; 3634 struct irq_cfg *cfg; 3635 struct iommu_dev_data *dev_data; 3636 unsigned long max_irqs; 3637 int i, ret, devid, seg, sbdf; 3638 int index; 3639 3640 if (!info) 3641 return -EINVAL; 3642 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) 3643 return -EINVAL; 3644 3645 sbdf = get_devid(info); 3646 if (sbdf < 0) 3647 return -EINVAL; 3648 3649 seg = PCI_SBDF_TO_SEGID(sbdf); 3650 devid = PCI_SBDF_TO_DEVID(sbdf); 3651 iommu = __rlookup_amd_iommu(seg, devid); 3652 if (!iommu) 3653 return -EINVAL; 3654 3655 dev_data = search_dev_data(iommu, devid); 3656 max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; 3657 3658 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); 3659 if (ret < 0) 3660 return ret; 3661 3662 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { 3663 struct irq_remap_table *table; 3664 3665 table = alloc_irq_table(iommu, devid, NULL, max_irqs); 3666 if (table) { 3667 if (!table->min_index) { 3668 /* 3669 * Keep the first 32 indexes free for IOAPIC 3670 * interrupts. 
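				 * An IOAPIC pin maps 1:1 to its IRTE index,
				 * see the info->ioapic.pin assignment below.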
3671 */ 3672 table->min_index = 32; 3673 for (i = 0; i < 32; ++i) 3674 iommu->irte_ops->set_allocated(table, i); 3675 } 3676 WARN_ON(table->min_index != 32); 3677 index = info->ioapic.pin; 3678 } else { 3679 index = -ENOMEM; 3680 } 3681 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || 3682 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { 3683 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); 3684 3685 index = alloc_irq_index(iommu, devid, nr_irqs, align, 3686 msi_desc_to_pci_dev(info->desc), 3687 max_irqs); 3688 } else { 3689 index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, 3690 max_irqs); 3691 } 3692 3693 if (index < 0) { 3694 pr_warn("Failed to allocate IRTE\n"); 3695 ret = index; 3696 goto out_free_parent; 3697 } 3698 3699 for (i = 0; i < nr_irqs; i++) { 3700 irq_data = irq_domain_get_irq_data(domain, virq + i); 3701 cfg = irq_data ? irqd_cfg(irq_data) : NULL; 3702 if (!cfg) { 3703 ret = -EINVAL; 3704 goto out_free_data; 3705 } 3706 3707 ret = -ENOMEM; 3708 data = kzalloc(sizeof(*data), GFP_KERNEL); 3709 if (!data) 3710 goto out_free_data; 3711 3712 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3713 data->entry = kzalloc(sizeof(union irte), GFP_KERNEL); 3714 else 3715 data->entry = kzalloc(sizeof(struct irte_ga), 3716 GFP_KERNEL); 3717 if (!data->entry) { 3718 kfree(data); 3719 goto out_free_data; 3720 } 3721 3722 data->iommu = iommu; 3723 irq_data->hwirq = (devid << 16) + i; 3724 irq_data->chip_data = data; 3725 irq_data->chip = &amd_ir_chip; 3726 irq_remapping_prepare_irte(data, cfg, info, devid, index, i); 3727 } 3728 3729 return 0; 3730 3731 out_free_data: 3732 for (i--; i >= 0; i--) { 3733 irq_data = irq_domain_get_irq_data(domain, virq + i); 3734 if (irq_data) 3735 kfree(irq_data->chip_data); 3736 } 3737 for (i = 0; i < nr_irqs; i++) 3738 free_irte(iommu, devid, index + i); 3739 out_free_parent: 3740 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3741 return ret; 3742 } 3743 3744 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, 3745 unsigned int nr_irqs) 3746 { 3747 struct irq_2_irte *irte_info; 3748 struct irq_data *irq_data; 3749 struct amd_ir_data *data; 3750 int i; 3751 3752 for (i = 0; i < nr_irqs; i++) { 3753 irq_data = irq_domain_get_irq_data(domain, virq + i); 3754 if (irq_data && irq_data->chip_data) { 3755 data = irq_data->chip_data; 3756 irte_info = &data->irq_2_irte; 3757 free_irte(data->iommu, irte_info->devid, irte_info->index); 3758 kfree(data->entry); 3759 kfree(data); 3760 } 3761 } 3762 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3763 } 3764 3765 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 3766 struct amd_ir_data *ir_data, 3767 struct irq_2_irte *irte_info, 3768 struct irq_cfg *cfg); 3769 3770 static int irq_remapping_activate(struct irq_domain *domain, 3771 struct irq_data *irq_data, bool reserve) 3772 { 3773 struct amd_ir_data *data = irq_data->chip_data; 3774 struct irq_2_irte *irte_info = &data->irq_2_irte; 3775 struct amd_iommu *iommu = data->iommu; 3776 struct irq_cfg *cfg = irqd_cfg(irq_data); 3777 3778 if (!iommu) 3779 return 0; 3780 3781 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid, 3782 irte_info->index); 3783 amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg); 3784 return 0; 3785 } 3786 3787 static void irq_remapping_deactivate(struct irq_domain *domain, 3788 struct irq_data *irq_data) 3789 { 3790 struct amd_ir_data *data = irq_data->chip_data; 3791 struct irq_2_irte *irte_info = &data->irq_2_irte; 3792 struct amd_iommu *iommu = 
data->iommu; 3793 3794 if (iommu) 3795 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid, 3796 irte_info->index); 3797 } 3798 3799 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, 3800 enum irq_domain_bus_token bus_token) 3801 { 3802 struct amd_iommu *iommu; 3803 int devid = -1; 3804 3805 if (!amd_iommu_irq_remap) 3806 return 0; 3807 3808 if (x86_fwspec_is_ioapic(fwspec)) 3809 devid = get_ioapic_devid(fwspec->param[0]); 3810 else if (x86_fwspec_is_hpet(fwspec)) 3811 devid = get_hpet_devid(fwspec->param[0]); 3812 3813 if (devid < 0) 3814 return 0; 3815 iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff)); 3816 3817 return iommu && iommu->ir_domain == d; 3818 } 3819 3820 static const struct irq_domain_ops amd_ir_domain_ops = { 3821 .select = irq_remapping_select, 3822 .alloc = irq_remapping_alloc, 3823 .free = irq_remapping_free, 3824 .activate = irq_remapping_activate, 3825 .deactivate = irq_remapping_deactivate, 3826 }; 3827 3828 static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, 3829 bool ga_log_intr) 3830 { 3831 if (cpu >= 0) { 3832 entry->lo.fields_vapic.destination = 3833 APICID_TO_IRTE_DEST_LO(cpu); 3834 entry->hi.fields.destination = 3835 APICID_TO_IRTE_DEST_HI(cpu); 3836 entry->lo.fields_vapic.is_run = true; 3837 entry->lo.fields_vapic.ga_log_intr = false; 3838 } else { 3839 entry->lo.fields_vapic.is_run = false; 3840 entry->lo.fields_vapic.ga_log_intr = ga_log_intr; 3841 } 3842 } 3843 3844 /* 3845 * Update the pCPU information for an IRTE that is configured to post IRQs to 3846 * a vCPU, without issuing an IOMMU invalidation for the IRTE. 3847 * 3848 * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination 3849 * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't 3850 * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based 3851 * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is 3852 * blocking and requires a notification wake event). I.e. treat vCPUs that are 3853 * associated with a pCPU as running. This API is intended to be used when a 3854 * vCPU is scheduled in/out (or stops running for any reason), to do a fast 3855 * update of IsRun, GALogIntr, and (conditionally) Destination. 3856 * 3857 * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached 3858 * and thus don't require an invalidation to ensure the IOMMU consumes fresh 3859 * information. 
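 *
 * Minimal usage sketch (hypothetical caller such as a KVM vCPU load/put hook,
 * assuming ir_data was stashed earlier via amd_ir_set_vcpu_affinity()):
 *
 *	vCPU scheduled in:  amd_iommu_update_ga(ir_data, smp_processor_id(), false);
 *	vCPU blocks:        amd_iommu_update_ga(ir_data, -1, true);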
3860 */ 3861 int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 3862 { 3863 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3864 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3865 3866 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3867 return -EINVAL; 3868 3869 if (!entry || !entry->lo.fields_vapic.guest_mode) 3870 return 0; 3871 3872 if (!ir_data->iommu) 3873 return -ENODEV; 3874 3875 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3876 3877 return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3878 ir_data->irq_2_irte.index, entry); 3879 } 3880 EXPORT_SYMBOL(amd_iommu_update_ga); 3881 3882 int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 3883 { 3884 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3885 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3886 u64 valid; 3887 3888 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3889 return -EINVAL; 3890 3891 if (!entry) 3892 return 0; 3893 3894 valid = entry->lo.fields_vapic.valid; 3895 3896 entry->lo.val = 0; 3897 entry->hi.val = 0; 3898 3899 entry->lo.fields_vapic.valid = valid; 3900 entry->lo.fields_vapic.guest_mode = 1; 3901 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; 3902 entry->hi.fields.vector = ir_data->ga_vector; 3903 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; 3904 3905 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3906 3907 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3908 ir_data->irq_2_irte.index, entry); 3909 } 3910 EXPORT_SYMBOL(amd_iommu_activate_guest_mode); 3911 3912 int amd_iommu_deactivate_guest_mode(void *data) 3913 { 3914 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3915 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3916 struct irq_cfg *cfg = ir_data->cfg; 3917 u64 valid; 3918 3919 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3920 return -EINVAL; 3921 3922 if (!entry || !entry->lo.fields_vapic.guest_mode) 3923 return 0; 3924 3925 valid = entry->lo.fields_remap.valid; 3926 3927 entry->lo.val = 0; 3928 entry->hi.val = 0; 3929 3930 entry->lo.fields_remap.valid = valid; 3931 entry->lo.fields_remap.dm = apic->dest_mode_logical; 3932 entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED; 3933 entry->hi.fields.vector = cfg->vector; 3934 entry->lo.fields_remap.destination = 3935 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); 3936 entry->hi.fields.destination = 3937 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid); 3938 3939 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3940 ir_data->irq_2_irte.index, entry); 3941 } 3942 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); 3943 3944 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) 3945 { 3946 int ret; 3947 struct amd_iommu_pi_data *pi_data = info; 3948 struct amd_ir_data *ir_data = data->chip_data; 3949 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 3950 struct iommu_dev_data *dev_data; 3951 3952 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3953 return -EINVAL; 3954 3955 if (ir_data->iommu == NULL) 3956 return -EINVAL; 3957 3958 dev_data = search_dev_data(ir_data->iommu, irte_info->devid); 3959 3960 /* Note: 3961 * This device has never been set up for guest mode. 
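	 * (dev_data->use_vapic is only set when the device is attached to an
	 * unmanaged domain with vAPIC enabled, see amd_iommu_attach_device()), so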
3962 * we should not modify the IRTE 3963 */ 3964 if (!dev_data || !dev_data->use_vapic) 3965 return -EINVAL; 3966 3967 ir_data->cfg = irqd_cfg(data); 3968 3969 if (pi_data) { 3970 pi_data->ir_data = ir_data; 3971 3972 ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); 3973 ir_data->ga_vector = pi_data->vector; 3974 ir_data->ga_tag = pi_data->ga_tag; 3975 if (pi_data->is_guest_mode) 3976 ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, 3977 pi_data->ga_log_intr); 3978 else 3979 ret = amd_iommu_deactivate_guest_mode(ir_data); 3980 } else { 3981 ret = amd_iommu_deactivate_guest_mode(ir_data); 3982 } 3983 3984 return ret; 3985 } 3986 3987 3988 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 3989 struct amd_ir_data *ir_data, 3990 struct irq_2_irte *irte_info, 3991 struct irq_cfg *cfg) 3992 { 3993 3994 /* 3995 * Atomically updates the IRTE with the new destination, vector 3996 * and flushes the interrupt entry cache. 3997 */ 3998 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid, 3999 irte_info->index, cfg->vector, 4000 cfg->dest_apicid); 4001 } 4002 4003 static int amd_ir_set_affinity(struct irq_data *data, 4004 const struct cpumask *mask, bool force) 4005 { 4006 struct amd_ir_data *ir_data = data->chip_data; 4007 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4008 struct irq_cfg *cfg = irqd_cfg(data); 4009 struct irq_data *parent = data->parent_data; 4010 struct amd_iommu *iommu = ir_data->iommu; 4011 int ret; 4012 4013 if (!iommu) 4014 return -ENODEV; 4015 4016 ret = parent->chip->irq_set_affinity(parent, mask, force); 4017 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 4018 return ret; 4019 4020 amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg); 4021 /* 4022 * After this point, all the interrupts will start arriving 4023 * at the new destination. So, time to cleanup the previous 4024 * vector allocation. 4025 */ 4026 vector_schedule_cleanup(cfg); 4027 4028 return IRQ_SET_MASK_OK_DONE; 4029 } 4030 4031 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) 4032 { 4033 struct amd_ir_data *ir_data = irq_data->chip_data; 4034 4035 *msg = ir_data->msi_entry; 4036 } 4037 4038 static struct irq_chip amd_ir_chip = { 4039 .name = "AMD-IR", 4040 .irq_ack = apic_ack_irq, 4041 .irq_set_affinity = amd_ir_set_affinity, 4042 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, 4043 .irq_compose_msi_msg = ir_compose_msi_msg, 4044 }; 4045 4046 static const struct msi_parent_ops amdvi_msi_parent_ops = { 4047 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, 4048 .bus_select_token = DOMAIN_BUS_AMDVI, 4049 .bus_select_mask = MATCH_PCI_MSI, 4050 .prefix = "IR-", 4051 .init_dev_msi_info = msi_parent_init_dev_msi_info, 4052 }; 4053 4054 int amd_iommu_create_irq_domain(struct amd_iommu *iommu) 4055 { 4056 struct irq_domain_info info = { 4057 .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index), 4058 .ops = &amd_ir_domain_ops, 4059 .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, 4060 .host_data = iommu, 4061 .parent = arch_get_ir_parent_domain(), 4062 }; 4063 4064 if (!info.fwnode) 4065 return -ENOMEM; 4066 4067 iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops); 4068 if (!iommu->ir_domain) { 4069 irq_domain_free_fwnode(info.fwnode); 4070 return -ENOMEM; 4071 } 4072 return 0; 4073 } 4074 #endif 4075