1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. 4 * Author: Joerg Roedel <jroedel@suse.de> 5 * Leo Duran <leo.duran@amd.com> 6 */ 7 8 #define pr_fmt(fmt) "AMD-Vi: " fmt 9 #define dev_fmt(fmt) pr_fmt(fmt) 10 11 #include <linux/ratelimit.h> 12 #include <linux/pci.h> 13 #include <linux/acpi.h> 14 #include <linux/pci-ats.h> 15 #include <linux/bitmap.h> 16 #include <linux/slab.h> 17 #include <linux/string_choices.h> 18 #include <linux/debugfs.h> 19 #include <linux/scatterlist.h> 20 #include <linux/dma-map-ops.h> 21 #include <linux/dma-direct.h> 22 #include <linux/idr.h> 23 #include <linux/iommu-helper.h> 24 #include <linux/delay.h> 25 #include <linux/amd-iommu.h> 26 #include <linux/notifier.h> 27 #include <linux/export.h> 28 #include <linux/irq.h> 29 #include <linux/irqchip/irq-msi-lib.h> 30 #include <linux/msi.h> 31 #include <linux/irqdomain.h> 32 #include <linux/percpu.h> 33 #include <linux/cc_platform.h> 34 #include <asm/irq_remapping.h> 35 #include <asm/io_apic.h> 36 #include <asm/apic.h> 37 #include <asm/hw_irq.h> 38 #include <asm/proto.h> 39 #include <asm/iommu.h> 40 #include <asm/gart.h> 41 #include <asm/dma.h> 42 #include <uapi/linux/iommufd.h> 43 #include <linux/generic_pt/iommu.h> 44 45 #include "amd_iommu.h" 46 #include "iommufd.h" 47 #include "../irq_remapping.h" 48 #include "../iommu-pages.h" 49 50 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 51 52 /* Reserved IOVA ranges */ 53 #define MSI_RANGE_START (0xfee00000) 54 #define MSI_RANGE_END (0xfeefffff) 55 #define HT_RANGE_START (0xfd00000000ULL) 56 #define HT_RANGE_END (0xffffffffffULL) 57 58 LIST_HEAD(ioapic_map); 59 LIST_HEAD(hpet_map); 60 LIST_HEAD(acpihid_map); 61 62 const struct iommu_ops amd_iommu_ops; 63 64 int amd_iommu_max_glx_val = -1; 65 66 /* 67 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 68 * to know which ones are already in use. 
69 */ 70 DEFINE_IDA(pdom_ids); 71 72 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 73 struct iommu_domain *old); 74 75 static void set_dte_entry(struct amd_iommu *iommu, 76 struct iommu_dev_data *dev_data, 77 phys_addr_t top_paddr, unsigned int top_level); 78 79 static int device_flush_dte(struct iommu_dev_data *dev_data); 80 81 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 82 phys_addr_t top_paddr, unsigned int top_level); 83 84 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 85 86 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 87 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 88 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 89 bool enable); 90 91 static void clone_aliases(struct amd_iommu *iommu, struct device *dev); 92 93 static int iommu_completion_wait(struct amd_iommu *iommu); 94 95 /**************************************************************************** 96 * 97 * Helper functions 98 * 99 ****************************************************************************/ 100 101 static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) 102 { 103 /* 104 * Note: 105 * We use arch_cmpxchg128_local() because: 106 * - Need cmpxchg16b instruction mainly for 128-bit store to DTE 107 * (not necessary for cmpxchg since this function is already 108 * protected by a spin_lock for this DTE). 109 * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. 110 */ 111 arch_cmpxchg128_local(ptr, *ptr, val); 112 } 113 114 static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new) 115 { 116 struct dev_table_entry old; 117 118 old.data128[1] = ptr->data128[1]; 119 /* 120 * Preserve DTE_DATA2_INTR_MASK. This needs to be 121 * done here since it requires to be inside 122 * spin_lock(&dev_data->dte_lock) context. 
123 */ 124 new->data[2] &= ~DTE_DATA2_INTR_MASK; 125 new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK; 126 127 amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]); 128 } 129 130 static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new) 131 { 132 amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]); 133 } 134 135 /* 136 * Note: 137 * IOMMU reads the entire Device Table entry in a single 256-bit transaction 138 * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver 139 * need to ensure the following: 140 * - DTE[V|GV] bit is being written last when setting. 141 * - DTE[V|GV] bit is being written first when clearing. 142 * 143 * This function is used only by code, which updates DMA translation part of the DTE. 144 * So, only consider control bits related to DMA when updating the entry. 145 */ 146 static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 147 struct dev_table_entry *new) 148 { 149 unsigned long flags; 150 struct dev_table_entry *dev_table = get_dev_table(iommu); 151 struct dev_table_entry *ptr = &dev_table[dev_data->devid]; 152 153 spin_lock_irqsave(&dev_data->dte_lock, flags); 154 155 if (!(ptr->data[0] & DTE_FLAG_V)) { 156 /* Existing DTE is not valid. */ 157 write_dte_upper128(ptr, new); 158 write_dte_lower128(ptr, new); 159 iommu_flush_dte_sync(iommu, dev_data->devid); 160 } else if (!(new->data[0] & DTE_FLAG_V)) { 161 /* Existing DTE is valid. New DTE is not valid. */ 162 write_dte_lower128(ptr, new); 163 write_dte_upper128(ptr, new); 164 iommu_flush_dte_sync(iommu, dev_data->devid); 165 } else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) { 166 /* 167 * Both DTEs are valid. 168 * Existing DTE has no guest page table. 169 */ 170 write_dte_upper128(ptr, new); 171 write_dte_lower128(ptr, new); 172 iommu_flush_dte_sync(iommu, dev_data->devid); 173 } else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) { 174 /* 175 * Both DTEs are valid. 
176 * Existing DTE has guest page table, 177 * new DTE has no guest page table, 178 */ 179 write_dte_lower128(ptr, new); 180 write_dte_upper128(ptr, new); 181 iommu_flush_dte_sync(iommu, dev_data->devid); 182 } else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) != 183 FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) { 184 /* 185 * Both DTEs are valid and have guest page table, 186 * but have different number of levels. So, we need 187 * to upadte both upper and lower 128-bit value, which 188 * require disabling and flushing. 189 */ 190 struct dev_table_entry clear = {}; 191 192 /* First disable DTE */ 193 write_dte_lower128(ptr, &clear); 194 iommu_flush_dte_sync(iommu, dev_data->devid); 195 196 /* Then update DTE */ 197 write_dte_upper128(ptr, new); 198 write_dte_lower128(ptr, new); 199 iommu_flush_dte_sync(iommu, dev_data->devid); 200 } else { 201 /* 202 * Both DTEs are valid and have guest page table, 203 * and same number of levels. We just need to only 204 * update the lower 128-bit. So no need to disable DTE. 
205 */ 206 write_dte_lower128(ptr, new); 207 } 208 209 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 210 } 211 212 void amd_iommu_update_dte(struct amd_iommu *iommu, 213 struct iommu_dev_data *dev_data, 214 struct dev_table_entry *new) 215 { 216 update_dte256(iommu, dev_data, new); 217 clone_aliases(iommu, dev_data->dev); 218 device_flush_dte(dev_data); 219 iommu_completion_wait(iommu); 220 } 221 222 static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 223 struct dev_table_entry *dte) 224 { 225 unsigned long flags; 226 struct dev_table_entry *ptr; 227 struct dev_table_entry *dev_table = get_dev_table(iommu); 228 229 ptr = &dev_table[dev_data->devid]; 230 231 spin_lock_irqsave(&dev_data->dte_lock, flags); 232 dte->data128[0] = ptr->data128[0]; 233 dte->data128[1] = ptr->data128[1]; 234 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 235 } 236 237 static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) 238 { 239 return (pdom && (pdom->pd_mode == PD_MODE_V2)); 240 } 241 242 static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom) 243 { 244 return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY); 245 } 246 247 /* 248 * We cannot support PASID w/ existing v1 page table in the same domain 249 * since it will be nested. However, existing domain w/ v2 page table 250 * or passthrough mode can be used for PASID. 251 */ 252 static inline bool pdom_is_sva_capable(struct protection_domain *pdom) 253 { 254 return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom); 255 } 256 257 static inline int get_acpihid_device_id(struct device *dev, 258 struct acpihid_map_entry **entry) 259 { 260 struct acpi_device *adev = ACPI_COMPANION(dev); 261 struct acpihid_map_entry *p, *p1 = NULL; 262 int hid_count = 0; 263 bool fw_bug; 264 265 if (!adev) 266 return -ENODEV; 267 268 list_for_each_entry(p, &acpihid_map, list) { 269 if (acpi_dev_hid_uid_match(adev, p->hid, 270 p->uid[0] ? 
p->uid : NULL)) { 271 p1 = p; 272 fw_bug = false; 273 hid_count = 1; 274 break; 275 } 276 277 /* 278 * Count HID matches w/o UID, raise FW_BUG but allow exactly one match 279 */ 280 if (acpi_dev_hid_match(adev, p->hid)) { 281 p1 = p; 282 hid_count++; 283 fw_bug = true; 284 } 285 } 286 287 if (!p1) 288 return -EINVAL; 289 if (fw_bug) 290 dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", 291 hid_count, str_plural(hid_count)); 292 if (hid_count > 1) 293 return -EINVAL; 294 if (entry) 295 *entry = p1; 296 297 return p1->devid; 298 } 299 300 static inline int get_device_sbdf_id(struct device *dev) 301 { 302 int sbdf; 303 304 if (dev_is_pci(dev)) 305 sbdf = get_pci_sbdf_id(to_pci_dev(dev)); 306 else 307 sbdf = get_acpihid_device_id(dev, NULL); 308 309 return sbdf; 310 } 311 312 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu) 313 { 314 struct dev_table_entry *dev_table; 315 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 316 317 BUG_ON(pci_seg == NULL); 318 dev_table = pci_seg->dev_table; 319 BUG_ON(dev_table == NULL); 320 321 return dev_table; 322 } 323 324 static inline u16 get_device_segment(struct device *dev) 325 { 326 u16 seg; 327 328 if (dev_is_pci(dev)) { 329 struct pci_dev *pdev = to_pci_dev(dev); 330 331 seg = pci_domain_nr(pdev->bus); 332 } else { 333 u32 devid = get_acpihid_device_id(dev, NULL); 334 335 seg = PCI_SBDF_TO_SEGID(devid); 336 } 337 338 return seg; 339 } 340 341 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */ 342 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid) 343 { 344 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 345 346 pci_seg->rlookup_table[devid] = iommu; 347 } 348 349 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) 350 { 351 struct amd_iommu_pci_seg *pci_seg; 352 353 for_each_pci_segment(pci_seg) { 354 if (pci_seg->id == seg) 355 return pci_seg->rlookup_table[devid]; 356 } 357 return NULL; 358 } 359 360 static 
struct amd_iommu *rlookup_amd_iommu(struct device *dev) 361 { 362 u16 seg = get_device_segment(dev); 363 int devid = get_device_sbdf_id(dev); 364 365 if (devid < 0) 366 return NULL; 367 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid)); 368 } 369 370 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) 371 { 372 struct iommu_dev_data *dev_data; 373 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 374 375 dev_data = kzalloc_obj(*dev_data); 376 if (!dev_data) 377 return NULL; 378 379 mutex_init(&dev_data->mutex); 380 spin_lock_init(&dev_data->dte_lock); 381 dev_data->devid = devid; 382 ratelimit_default_init(&dev_data->rs); 383 384 llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list); 385 return dev_data; 386 } 387 388 struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) 389 { 390 struct iommu_dev_data *dev_data; 391 struct llist_node *node; 392 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 393 394 if (llist_empty(&pci_seg->dev_data_list)) 395 return NULL; 396 397 node = pci_seg->dev_data_list.first; 398 llist_for_each_entry(dev_data, node, dev_data_list) { 399 if (dev_data->devid == devid) 400 return dev_data; 401 } 402 403 return NULL; 404 } 405 406 static int clone_alias(struct pci_dev *pdev_origin, u16 alias, void *data) 407 { 408 struct dev_table_entry new; 409 struct amd_iommu *iommu; 410 struct iommu_dev_data *dev_data, *alias_data; 411 struct pci_dev *pdev = data; 412 u16 devid = pci_dev_id(pdev); 413 int ret = 0; 414 415 if (devid == alias) 416 return 0; 417 418 iommu = rlookup_amd_iommu(&pdev->dev); 419 if (!iommu) 420 return 0; 421 422 /* Copy the data from pdev */ 423 dev_data = dev_iommu_priv_get(&pdev->dev); 424 if (!dev_data) { 425 pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid); 426 ret = -EINVAL; 427 goto out; 428 } 429 get_dte256(iommu, dev_data, &new); 430 431 /* Setup alias */ 432 alias_data = find_dev_data(iommu, alias); 433 if (!alias_data) { 434 
pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias); 435 ret = -EINVAL; 436 goto out; 437 } 438 update_dte256(iommu, alias_data, &new); 439 440 amd_iommu_set_rlookup_table(iommu, alias); 441 out: 442 return ret; 443 } 444 445 static void clone_aliases(struct amd_iommu *iommu, struct device *dev) 446 { 447 struct pci_dev *pdev; 448 449 if (!dev_is_pci(dev)) 450 return; 451 pdev = to_pci_dev(dev); 452 453 /* 454 * The IVRS alias stored in the alias table may not be 455 * part of the PCI DMA aliases if it's bus differs 456 * from the original device. 457 */ 458 clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], pdev); 459 460 pci_for_each_dma_alias(pdev, clone_alias, pdev); 461 } 462 463 static void setup_aliases(struct amd_iommu *iommu, struct device *dev) 464 { 465 struct pci_dev *pdev = to_pci_dev(dev); 466 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 467 u16 ivrs_alias; 468 469 /* For ACPI HID devices, there are no aliases */ 470 if (!dev_is_pci(dev)) 471 return; 472 473 /* 474 * Add the IVRS alias to the pci aliases if it is on the same 475 * bus. The IVRS table may know about a quirk that we don't. 476 */ 477 ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)]; 478 if (ivrs_alias != pci_dev_id(pdev) && 479 PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) 480 pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1); 481 482 clone_aliases(iommu, dev); 483 } 484 485 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid) 486 { 487 struct iommu_dev_data *dev_data; 488 489 dev_data = search_dev_data(iommu, devid); 490 491 if (dev_data == NULL) { 492 dev_data = alloc_dev_data(iommu, devid); 493 if (!dev_data) 494 return NULL; 495 496 if (translation_pre_enabled(iommu)) 497 dev_data->defer_attach = true; 498 } 499 500 return dev_data; 501 } 502 503 /* 504 * Find or create an IOMMU group for a acpihid device. 
505 */ 506 static struct iommu_group *acpihid_device_group(struct device *dev) 507 { 508 struct acpihid_map_entry *p, *entry = NULL; 509 int devid; 510 511 devid = get_acpihid_device_id(dev, &entry); 512 if (devid < 0) 513 return ERR_PTR(devid); 514 515 list_for_each_entry(p, &acpihid_map, list) { 516 if ((devid == p->devid) && p->group) 517 entry->group = p->group; 518 } 519 520 if (!entry->group) 521 entry->group = generic_device_group(dev); 522 else 523 iommu_group_ref_get(entry->group); 524 525 return entry->group; 526 } 527 528 static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) 529 { 530 return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); 531 } 532 533 static u32 pdev_get_caps(struct pci_dev *pdev) 534 { 535 int features; 536 u32 flags = 0; 537 538 if (pci_ats_supported(pdev)) 539 flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 540 541 if (pci_pri_supported(pdev)) 542 flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 543 544 features = pci_pasid_features(pdev); 545 if (features >= 0) { 546 flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 547 548 if (features & PCI_PASID_CAP_EXEC) 549 flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 550 551 if (features & PCI_PASID_CAP_PRIV) 552 flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 553 } 554 555 return flags; 556 } 557 558 static inline int pdev_enable_cap_ats(struct pci_dev *pdev) 559 { 560 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 561 int ret = -EINVAL; 562 563 if (dev_data->ats_enabled) 564 return 0; 565 566 if (amd_iommu_iotlb_sup && 567 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { 568 ret = pci_enable_ats(pdev, PAGE_SHIFT); 569 if (!ret) { 570 dev_data->ats_enabled = 1; 571 dev_data->ats_qdep = pci_ats_queue_depth(pdev); 572 } 573 } 574 575 return ret; 576 } 577 578 static inline void pdev_disable_cap_ats(struct pci_dev *pdev) 579 { 580 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 581 582 if (dev_data->ats_enabled) { 583 pci_disable_ats(pdev); 584 dev_data->ats_enabled 
= 0; 585 } 586 } 587 588 static inline int pdev_enable_cap_pri(struct pci_dev *pdev) 589 { 590 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 591 int ret = -EINVAL; 592 593 if (dev_data->pri_enabled) 594 return 0; 595 596 if (!dev_data->ats_enabled) 597 return 0; 598 599 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { 600 /* 601 * First reset the PRI state of the device. 602 * FIXME: Hardcode number of outstanding requests for now 603 */ 604 if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) { 605 dev_data->pri_enabled = 1; 606 dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); 607 608 ret = 0; 609 } 610 } 611 612 return ret; 613 } 614 615 static inline void pdev_disable_cap_pri(struct pci_dev *pdev) 616 { 617 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 618 619 if (dev_data->pri_enabled) { 620 pci_disable_pri(pdev); 621 dev_data->pri_enabled = 0; 622 } 623 } 624 625 static inline int pdev_enable_cap_pasid(struct pci_dev *pdev) 626 { 627 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 628 int ret = -EINVAL; 629 630 if (dev_data->pasid_enabled) 631 return 0; 632 633 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) { 634 /* Only allow access to user-accessible pages */ 635 ret = pci_enable_pasid(pdev, 0); 636 if (!ret) 637 dev_data->pasid_enabled = 1; 638 } 639 640 return ret; 641 } 642 643 static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) 644 { 645 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 646 647 if (dev_data->pasid_enabled) { 648 pci_disable_pasid(pdev); 649 dev_data->pasid_enabled = 0; 650 } 651 } 652 653 static void pdev_enable_caps(struct pci_dev *pdev) 654 { 655 pdev_enable_cap_pasid(pdev); 656 pdev_enable_cap_ats(pdev); 657 pdev_enable_cap_pri(pdev); 658 } 659 660 static void pdev_disable_caps(struct pci_dev *pdev) 661 { 662 pdev_disable_cap_ats(pdev); 663 pdev_disable_cap_pasid(pdev); 664 pdev_disable_cap_pri(pdev); 665 } 666 667 /* 668 * This 
function checks if the driver got a valid device from the caller to 669 * avoid dereferencing invalid pointers. 670 */ 671 static bool check_device(struct device *dev) 672 { 673 struct amd_iommu_pci_seg *pci_seg; 674 struct amd_iommu *iommu; 675 int devid, sbdf; 676 677 if (!dev) 678 return false; 679 680 sbdf = get_device_sbdf_id(dev); 681 if (sbdf < 0) 682 return false; 683 devid = PCI_SBDF_TO_DEVID(sbdf); 684 685 iommu = rlookup_amd_iommu(dev); 686 if (!iommu) 687 return false; 688 689 /* Out of our scope? */ 690 pci_seg = iommu->pci_seg; 691 if (devid > pci_seg->last_bdf) 692 return false; 693 694 return true; 695 } 696 697 static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) 698 { 699 struct iommu_dev_data *dev_data; 700 int devid, sbdf; 701 702 if (dev_iommu_priv_get(dev)) 703 return 0; 704 705 sbdf = get_device_sbdf_id(dev); 706 if (sbdf < 0) 707 return sbdf; 708 709 devid = PCI_SBDF_TO_DEVID(sbdf); 710 dev_data = find_dev_data(iommu, devid); 711 if (!dev_data) 712 return -ENOMEM; 713 714 dev_data->dev = dev; 715 716 /* 717 * The dev_iommu_priv_set() needes to be called before setup_aliases. 718 * Otherwise, subsequent call to dev_iommu_priv_get() will fail. 719 */ 720 dev_iommu_priv_set(dev, dev_data); 721 setup_aliases(iommu, dev); 722 723 /* 724 * By default we use passthrough mode for IOMMUv2 capable device. 725 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to 726 * invalid address), we ignore the capability for the device so 727 * it'll be forced to go into translation mode. 
728 */ 729 if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && 730 dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { 731 dev_data->flags = pdev_get_caps(to_pci_dev(dev)); 732 } 733 734 return 0; 735 } 736 737 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) 738 { 739 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 740 struct dev_table_entry *dev_table = get_dev_table(iommu); 741 int devid, sbdf; 742 743 sbdf = get_device_sbdf_id(dev); 744 if (sbdf < 0) 745 return; 746 747 devid = PCI_SBDF_TO_DEVID(sbdf); 748 pci_seg->rlookup_table[devid] = NULL; 749 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry)); 750 751 setup_aliases(iommu, dev); 752 } 753 754 755 /**************************************************************************** 756 * 757 * Interrupt handling functions 758 * 759 ****************************************************************************/ 760 761 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) 762 { 763 int i; 764 struct dev_table_entry dte; 765 struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); 766 767 get_dte256(iommu, dev_data, &dte); 768 769 for (i = 0; i < 4; ++i) 770 pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); 771 } 772 773 static void dump_command(unsigned long phys_addr) 774 { 775 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 776 int i; 777 778 for (i = 0; i < 4; ++i) 779 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]); 780 } 781 782 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event) 783 { 784 struct iommu_dev_data *dev_data = NULL; 785 int devid, vmg_tag, flags; 786 struct pci_dev *pdev; 787 u64 spa; 788 789 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 790 vmg_tag = (event[1]) & 0xFFFF; 791 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 792 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8); 793 794 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 795 devid 
& 0xff); 796 if (pdev) 797 dev_data = dev_iommu_priv_get(&pdev->dev); 798 799 if (dev_data) { 800 if (__ratelimit(&dev_data->rs)) { 801 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 802 vmg_tag, spa, flags); 803 } 804 } else { 805 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 806 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 807 vmg_tag, spa, flags); 808 } 809 810 if (pdev) 811 pci_dev_put(pdev); 812 } 813 814 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event) 815 { 816 struct iommu_dev_data *dev_data = NULL; 817 int devid, flags_rmp, vmg_tag, flags; 818 struct pci_dev *pdev; 819 u64 gpa; 820 821 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 822 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF; 823 vmg_tag = (event[1]) & 0xFFFF; 824 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 825 gpa = ((u64)event[3] << 32) | event[2]; 826 827 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 828 devid & 0xff); 829 if (pdev) 830 dev_data = dev_iommu_priv_get(&pdev->dev); 831 832 if (dev_data) { 833 if (__ratelimit(&dev_data->rs)) { 834 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 835 vmg_tag, gpa, flags_rmp, flags); 836 } 837 } else { 838 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 839 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 840 vmg_tag, gpa, flags_rmp, flags); 841 } 842 843 if (pdev) 844 pci_dev_put(pdev); 845 } 846 847 #define IS_IOMMU_MEM_TRANSACTION(flags) \ 848 (((flags) & EVENT_FLAG_I) == 0) 849 850 #define IS_WRITE_REQUEST(flags) \ 851 ((flags) & EVENT_FLAG_RW) 852 853 static void amd_iommu_report_page_fault(struct amd_iommu *iommu, 854 u16 devid, u16 
domain_id, 855 u64 address, int flags) 856 { 857 struct iommu_dev_data *dev_data = NULL; 858 struct pci_dev *pdev; 859 860 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 861 devid & 0xff); 862 if (pdev) 863 dev_data = dev_iommu_priv_get(&pdev->dev); 864 865 if (dev_data) { 866 /* 867 * If this is a DMA fault (for which the I(nterrupt) 868 * bit will be unset), allow report_iommu_fault() to 869 * prevent logging it. 870 */ 871 if (IS_IOMMU_MEM_TRANSACTION(flags)) { 872 /* Device not attached to domain properly */ 873 if (dev_data->domain == NULL) { 874 pr_err_ratelimited("Event logged [Device not attached to domain properly]\n"); 875 pr_err_ratelimited(" device=%04x:%02x:%02x.%x domain=0x%04x\n", 876 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), 877 PCI_FUNC(devid), domain_id); 878 goto out; 879 } 880 881 if (!report_iommu_fault(&dev_data->domain->domain, 882 &pdev->dev, address, 883 IS_WRITE_REQUEST(flags) ? 884 IOMMU_FAULT_WRITE : 885 IOMMU_FAULT_READ)) 886 goto out; 887 } 888 889 if (__ratelimit(&dev_data->rs)) { 890 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", 891 domain_id, address, flags); 892 } 893 } else { 894 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", 895 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 896 domain_id, address, flags); 897 } 898 899 out: 900 if (pdev) 901 pci_dev_put(pdev); 902 } 903 904 static void iommu_print_event(struct amd_iommu *iommu, void *__evt) 905 { 906 struct device *dev = iommu->iommu.dev; 907 int type, devid, flags, tag; 908 volatile u32 *event = __evt; 909 int count = 0; 910 u64 address, ctrl; 911 u32 pasid; 912 913 retry: 914 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 915 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 916 pasid = (event[0] & EVENT_DOMID_MASK_HI) | 917 (event[1] & EVENT_DOMID_MASK_LO); 918 flags = 
(event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 919 address = (u64)(((u64)event[3]) << 32) | event[2]; 920 ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); 921 922 if (type == 0) { 923 /* Did we hit the erratum? */ 924 if (++count == LOOP_TIMEOUT) { 925 pr_err("No event written to event log\n"); 926 return; 927 } 928 udelay(1); 929 goto retry; 930 } 931 932 if (type == EVENT_TYPE_IO_FAULT) { 933 amd_iommu_report_page_fault(iommu, devid, pasid, address, flags); 934 return; 935 } 936 937 switch (type) { 938 case EVENT_TYPE_ILL_DEV: 939 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 940 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 941 pasid, address, flags); 942 dev_err(dev, "Control Reg : 0x%llx\n", ctrl); 943 dump_dte_entry(iommu, devid); 944 break; 945 case EVENT_TYPE_DEV_TAB_ERR: 946 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x " 947 "address=0x%llx flags=0x%04x]\n", 948 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 949 address, flags); 950 break; 951 case EVENT_TYPE_PAGE_TAB_ERR: 952 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n", 953 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 954 pasid, address, flags); 955 break; 956 case EVENT_TYPE_ILL_CMD: 957 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address); 958 dump_command(address); 959 break; 960 case EVENT_TYPE_CMD_HARD_ERR: 961 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n", 962 address, flags); 963 break; 964 case EVENT_TYPE_IOTLB_INV_TO: 965 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n", 966 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 967 address); 968 break; 969 case EVENT_TYPE_INV_DEV_REQ: 970 
dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 971 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 972 pasid, address, flags); 973 break; 974 case EVENT_TYPE_RMP_FAULT: 975 amd_iommu_report_rmp_fault(iommu, event); 976 break; 977 case EVENT_TYPE_RMP_HW_ERR: 978 amd_iommu_report_rmp_hw_error(iommu, event); 979 break; 980 case EVENT_TYPE_INV_PPR_REQ: 981 pasid = PPR_PASID(*((u64 *)__evt)); 982 tag = event[1] & 0x03FF; 983 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", 984 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 985 pasid, address, flags, tag); 986 break; 987 default: 988 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n", 989 event[0], event[1], event[2], event[3]); 990 } 991 992 /* 993 * To detect the hardware errata 732 we need to clear the 994 * entry back to zero. This issue does not exist on SNP 995 * enabled system. Also this buffer is not writeable on 996 * SNP enabled system. 
997 */ 998 if (!amd_iommu_snp_en) 999 memset(__evt, 0, 4 * sizeof(u32)); 1000 } 1001 1002 static void iommu_poll_events(struct amd_iommu *iommu) 1003 { 1004 u32 head, tail; 1005 1006 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 1007 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 1008 1009 while (head != tail) { 1010 iommu_print_event(iommu, iommu->evt_buf + head); 1011 1012 /* Update head pointer of hardware ring-buffer */ 1013 head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; 1014 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 1015 } 1016 1017 } 1018 1019 #ifdef CONFIG_IRQ_REMAP 1020 static int (*iommu_ga_log_notifier)(u32); 1021 1022 int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) 1023 { 1024 iommu_ga_log_notifier = notifier; 1025 1026 /* 1027 * Ensure all in-flight IRQ handlers run to completion before returning 1028 * to the caller, e.g. to ensure module code isn't unloaded while it's 1029 * being executed in the IRQ handler. 1030 */ 1031 if (!notifier) 1032 synchronize_rcu(); 1033 1034 return 0; 1035 } 1036 EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); 1037 1038 static void iommu_poll_ga_log(struct amd_iommu *iommu) 1039 { 1040 u32 head, tail; 1041 1042 if (iommu->ga_log == NULL) 1043 return; 1044 1045 head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1046 tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET); 1047 1048 while (head != tail) { 1049 volatile u64 *raw; 1050 u64 log_entry; 1051 1052 raw = (u64 *)(iommu->ga_log + head); 1053 1054 /* Avoid memcpy function-call overhead */ 1055 log_entry = *raw; 1056 1057 /* Update head pointer of hardware ring-buffer */ 1058 head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE; 1059 writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1060 1061 /* Handle GA entry */ 1062 switch (GA_REQ_TYPE(log_entry)) { 1063 case GA_GUEST_NR: 1064 if (!iommu_ga_log_notifier) 1065 break; 1066 1067 pr_debug("%s: devid=%#x, ga_tag=%#x\n", 1068 __func__, GA_DEVID(log_entry), 1069 
GA_TAG(log_entry)); 1070 1071 if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0) 1072 pr_err("GA log notifier failed.\n"); 1073 break; 1074 default: 1075 break; 1076 } 1077 } 1078 } 1079 1080 static void 1081 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) 1082 { 1083 if (!irq_remapping_enabled || !dev_is_pci(dev) || 1084 !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev))) 1085 return; 1086 1087 dev_set_msi_domain(dev, iommu->ir_domain); 1088 } 1089 1090 #else /* CONFIG_IRQ_REMAP */ 1091 static inline void 1092 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } 1093 #endif /* !CONFIG_IRQ_REMAP */ 1094 1095 static void amd_iommu_handle_irq(void *data, const char *evt_type, 1096 u32 int_mask, u32 overflow_mask, 1097 void (*int_handler)(struct amd_iommu *), 1098 void (*overflow_handler)(struct amd_iommu *)) 1099 { 1100 struct amd_iommu *iommu = (struct amd_iommu *) data; 1101 u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 1102 u32 mask = int_mask | overflow_mask; 1103 1104 while (status & mask) { 1105 /* Enable interrupt sources again */ 1106 writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET); 1107 1108 if (int_handler) { 1109 pr_devel("Processing IOMMU (ivhd%d) %s Log\n", 1110 iommu->index, evt_type); 1111 int_handler(iommu); 1112 } 1113 1114 if ((status & overflow_mask) && overflow_handler) 1115 overflow_handler(iommu); 1116 1117 /* 1118 * Hardware bug: ERBT1312 1119 * When re-enabling interrupt (by writing 1 1120 * to clear the bit), the hardware might also try to set 1121 * the interrupt bit in the event status register. 1122 * In this scenario, the bit will be set, and disable 1123 * subsequent interrupts. 1124 * 1125 * Workaround: The IOMMU driver should read back the 1126 * status register and check if the interrupt bits are cleared. 
1127 * If not, driver will need to go through the interrupt handler 1128 * again and re-clear the bits 1129 */ 1130 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 1131 } 1132 } 1133 1134 irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data) 1135 { 1136 amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK, 1137 MMIO_STATUS_EVT_OVERFLOW_MASK, 1138 iommu_poll_events, amd_iommu_restart_event_logging); 1139 1140 return IRQ_HANDLED; 1141 } 1142 1143 irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data) 1144 { 1145 amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK, 1146 MMIO_STATUS_PPR_OVERFLOW_MASK, 1147 amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log); 1148 1149 return IRQ_HANDLED; 1150 } 1151 1152 irqreturn_t amd_iommu_int_thread_galog(int irq, void *data) 1153 { 1154 #ifdef CONFIG_IRQ_REMAP 1155 amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK, 1156 MMIO_STATUS_GALOG_OVERFLOW_MASK, 1157 iommu_poll_ga_log, amd_iommu_restart_ga_log); 1158 #endif 1159 1160 return IRQ_HANDLED; 1161 } 1162 1163 irqreturn_t amd_iommu_int_thread(int irq, void *data) 1164 { 1165 amd_iommu_int_thread_evtlog(irq, data); 1166 amd_iommu_int_thread_pprlog(irq, data); 1167 amd_iommu_int_thread_galog(irq, data); 1168 1169 return IRQ_HANDLED; 1170 } 1171 1172 /**************************************************************************** 1173 * 1174 * IOMMU command queuing functions 1175 * 1176 ****************************************************************************/ 1177 1178 static void dump_command_buffer(struct amd_iommu *iommu) 1179 { 1180 struct iommu_cmd *cmd; 1181 u32 head, tail; 1182 int i; 1183 1184 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 1185 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 1186 1187 pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), 1188 MMIO_CMD_BUFFER_TAIL(tail)); 1189 1190 for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { 1191 cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * 
sizeof(*cmd)); 1192 pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], 1193 cmd->data[3]); 1194 } 1195 } 1196 1197 static int wait_on_sem(struct amd_iommu *iommu, u64 data) 1198 { 1199 int i = 0; 1200 1201 /* 1202 * cmd_sem holds a monotonically non-decreasing completion sequence 1203 * number. 1204 */ 1205 while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 && 1206 i < LOOP_TIMEOUT) { 1207 udelay(1); 1208 i += 1; 1209 } 1210 1211 if (i == LOOP_TIMEOUT) { 1212 1213 pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", 1214 iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), 1215 PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); 1216 1217 if (amd_iommu_dump) 1218 DO_ONCE_LITE(dump_command_buffer, iommu); 1219 1220 return -EIO; 1221 } 1222 1223 return 0; 1224 } 1225 1226 static void copy_cmd_to_buffer(struct amd_iommu *iommu, 1227 struct iommu_cmd *cmd) 1228 { 1229 u8 *target; 1230 u32 tail; 1231 1232 /* Copy command to buffer */ 1233 tail = iommu->cmd_buf_tail; 1234 target = iommu->cmd_buf + tail; 1235 memcpy(target, cmd, sizeof(*cmd)); 1236 1237 tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; 1238 iommu->cmd_buf_tail = tail; 1239 1240 /* Tell the IOMMU about it */ 1241 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 1242 } 1243 1244 static void build_completion_wait(struct iommu_cmd *cmd, 1245 struct amd_iommu *iommu, 1246 u64 data) 1247 { 1248 u64 paddr = iommu->cmd_sem_paddr; 1249 1250 memset(cmd, 0, sizeof(*cmd)); 1251 cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; 1252 cmd->data[1] = upper_32_bits(paddr); 1253 cmd->data[2] = lower_32_bits(data); 1254 cmd->data[3] = upper_32_bits(data); 1255 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 1256 } 1257 1258 static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) 1259 { 1260 memset(cmd, 0, sizeof(*cmd)); 1261 cmd->data[0] = devid; 1262 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); 1263 } 1264 1265 /* 1266 * Builds an invalidation address which is suitable for one 
 * page or multiple pages. Sets the size bit (S) as needed if more than one
 * page is flushed.
 *
 * For a single page the page-aligned address is returned unchanged. For a
 * larger range the result covers a power-of-two region containing
 * [address, address + size), or the "all pages" address when start and end
 * differ above bit 51.
 */
static inline u64 build_inv_address(u64 address, size_t size)
{
	u64 pages, end, msb_diff;

	pages = iommu_num_pages(address, size, PAGE_SIZE);

	if (pages == 1)
		return address & PAGE_MASK;

	end = address + size - 1;

	/*
	 * msb_diff would hold the index of the most significant bit that
	 * flipped between the start and end.
	 */
	msb_diff = fls64(end ^ address) - 1;

	/*
	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
	 * between the start and the end, invalidate everything.
	 */
	if (unlikely(msb_diff > 51)) {
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
	} else {
		/*
		 * The msb-bit must be clear on the address. Just set all the
		 * lower bits.
		 */
		address |= (1ull << msb_diff) - 1;
	}

	/* Clear bits 11:0 */
	address &= PAGE_MASK;

	/* Set the size bit - we flush more than one 4kb page */
	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
}

/*
 * Build an INVALIDATE_IOMMU_PAGES command for the range [address,
 * address + size) in domain @domid. The PDE bit is always set. When @gn is
 * true the command additionally carries @pasid and the GN flag.
 */
static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  size_t size, u16 domid,
				  ioasid_t pasid, bool gn)
{
	u64 inv_address = build_inv_address(address, size);

	memset(cmd, 0, sizeof(*cmd));

	cmd->data[1] |= domid;
	cmd->data[2] = lower_32_bits(inv_address);
	cmd->data[3] = upper_32_bits(inv_address);
	/* PDE bit - we want to flush everything, not only the PTEs */
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
	if (gn) {
		cmd->data[0] |= pasid;
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
}

/*
 * Build an INVALIDATE_IOTLB_PAGES command for device @devid.
 *
 * @qdep:  queue depth, low 8 bits stored in data[0] bits 31:24
 * @pasid: only encoded when @gn is true; bits 15:8 go to data[0] bits 23:16
 *         and bits 7:0 to data[1] bits 23:16
 */
static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, size_t size,
				  ioasid_t pasid, bool gn)
{
	u64 inv_address = build_inv_address(address, size);

	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0] = devid;
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1] = devid;
	cmd->data[2] = lower_32_bits(inv_address);
	cmd->data[3] = upper_32_bits(inv_address);
	if (gn) {
		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
		cmd->data[1] |= (pasid & 0xff) << 16;
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	}

	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
}

/*
 * Build a COMPLETE_PPR_REQUEST command for @devid. @pasid and the GN flag
 * are only encoded when @gn is non-zero; @tag is truncated to 9 bits and
 * @status is masked with PPR_STATUS_MASK.
 */
static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
			       int status, int tag, u8 gn)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0] = devid;
	if (gn) {
		cmd->data[1] = pasid;
		cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	cmd->data[3] = tag & 0x1ff;
	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;

	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
}

/* Build an INVALIDATE_IOMMU_ALL command (no operands). */
static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

/* Build an INVALIDATE_INTERRUPT_TABLE command for @devid. */
static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_IRT);
}

/*
 * Writes the command to the IOMMUs command buffer and informs the
 * hardware about the new command.
 *
 * Both callers in this file invoke it with iommu->lock held. If 0x20 bytes
 * or fewer are free between the next tail position and the cached head, the
 * hardware head register is re-read and the check retried (with a 1us delay
 * from the second retry on) until space appears or the retry counter reaches
 * LOOP_TIMEOUT, in which case -EIO is returned. @sync is stored in
 * iommu->need_sync so a later iommu_completion_wait() knows whether there is
 * anything to wait for.
 */
static int __iommu_queue_command_sync(struct amd_iommu *iommu,
				      struct iommu_cmd *cmd,
				      bool sync)
{
	unsigned int count = 0;
	u32 left, next_tail;

	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
again:
	left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;

	if (left <= 0x20) {
		/* Skip udelay() the first time around */
		if (count++) {
			if (count == LOOP_TIMEOUT) {
				pr_err("Command buffer timeout\n");
				return -EIO;
			}

			udelay(1);
		}

		/* Update head and recheck remaining space */
		iommu->cmd_buf_head = readl(iommu->mmio_base +
					    MMIO_CMD_HEAD_OFFSET);

		goto again;
	}

	copy_cmd_to_buffer(iommu, cmd);

	/* Do we need to make sure all commands are processed? */
	iommu->need_sync = sync;

	return 0;
}

/*
 * Locked wrapper: queue @cmd under iommu->lock with interrupts disabled.
 * Returns the result of __iommu_queue_command_sync().
 */
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
{
	unsigned long flags;
	int ret;

	raw_spin_lock_irqsave(&iommu->lock, flags);
	ret = __iommu_queue_command_sync(iommu, cmd, sync);
	raw_spin_unlock_irqrestore(&iommu->lock, flags);

	return ret;
}

/* Queue @cmd and mark the IOMMU as needing a completion wait. */
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}

/*
 * Return the next completion-wait sequence number (pre-increment of
 * iommu->cmd_sem_val). Caller must hold iommu->lock.
 */
static u64 get_cmdsem_val(struct amd_iommu *iommu)
{
	lockdep_assert_held(&iommu->lock);
	return ++iommu->cmd_sem_val;
}

/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU
 *
 * Returns 0 immediately when need_sync is clear. Otherwise a fresh sequence
 * number is taken and the command queued under iommu->lock, and the wait on
 * the semaphore happens after the lock has been dropped. Returns 0 on
 * success or the error from queuing / waiting.
 */
static int iommu_completion_wait(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	unsigned long flags;
	int ret;
	u64 data;

	if (!iommu->need_sync)
		return 0;

	raw_spin_lock_irqsave(&iommu->lock, flags);

	data = get_cmdsem_val(iommu);
	build_completion_wait(&cmd, iommu, data);

	/* sync=false: this command itself clears need_sync */
	ret = __iommu_queue_command_sync(iommu, &cmd, false);
	raw_spin_unlock_irqrestore(&iommu->lock, flags);

	if (ret)
		return ret;

	ret = wait_on_sem(iommu, data);

	return ret;
}

/*
 * Wait for command completion on every IOMMU recorded in the domain's
 * iommu_array. Caller must hold domain->lock. Errors from the individual
 * waits are not propagated.
 */
static void domain_flush_complete(struct protection_domain *domain)
{
	struct pdom_iommu_info *pdom_iommu_info;
	unsigned long i;

	lockdep_assert_held(&domain->lock);

	/*
	 * Devices of this domain are behind this IOMMU
	 * We need to wait for completion of all commands.
	 */
	xa_for_each(&domain->iommu_array, i, pdom_iommu_info)
		iommu_completion_wait(pdom_iommu_info->iommu);
}

/* Queue a device-table-entry invalidation for @devid (no wait). */
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_dte(&cmd, devid);

	return iommu_queue_command(iommu, &cmd);
}

/*
 * Invalidate the DTE of @devid and, if queuing succeeded, wait for
 * completion. Failures are not reported to the caller.
 */
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid)
{
	int ret;

	ret = iommu_flush_dte(iommu, devid);
	if (!ret)
		iommu_completion_wait(iommu);
}

/*
 * Invalidate the DTE of every device ID from 0 up to and including the PCI
 * segment's last_bdf, then wait for completion.
 */
static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;
	u16 last_bdf = iommu->pci_seg->last_bdf;

	/* devid is u32 so the loop terminates even when last_bdf is 0xffff */
	for (devid = 0; devid <= last_bdf; ++devid)
		iommu_flush_dte(iommu, devid);

	iommu_completion_wait(iommu);
}

/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
1521 */ 1522 static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) 1523 { 1524 u32 dom_id; 1525 u16 last_bdf = iommu->pci_seg->last_bdf; 1526 1527 for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { 1528 struct iommu_cmd cmd; 1529 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1530 dom_id, IOMMU_NO_PASID, false); 1531 iommu_queue_command(iommu, &cmd); 1532 } 1533 1534 iommu_completion_wait(iommu); 1535 } 1536 1537 static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) 1538 { 1539 struct iommu_cmd cmd; 1540 1541 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1542 dom_id, IOMMU_NO_PASID, false); 1543 iommu_queue_command(iommu, &cmd); 1544 1545 iommu_completion_wait(iommu); 1546 } 1547 1548 static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size) 1549 { 1550 int ret = 0; 1551 struct amd_iommu_viommu *aviommu; 1552 1553 list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) { 1554 unsigned long i; 1555 struct guest_domain_mapping_info *gdom_info; 1556 struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, 1557 struct amd_iommu, iommu); 1558 1559 xa_lock(&aviommu->gdomid_array); 1560 xa_for_each(&aviommu->gdomid_array, i, gdom_info) { 1561 struct iommu_cmd cmd; 1562 1563 pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__, 1564 iommu->devid, gdom_info->hdom_id); 1565 build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id, 1566 IOMMU_NO_PASID, false); 1567 ret |= iommu_queue_command(iommu, &cmd); 1568 } 1569 xa_unlock(&aviommu->gdomid_array); 1570 } 1571 return ret; 1572 } 1573 1574 static void amd_iommu_flush_all(struct amd_iommu *iommu) 1575 { 1576 struct iommu_cmd cmd; 1577 1578 build_inv_all(&cmd); 1579 1580 iommu_queue_command(iommu, &cmd); 1581 iommu_completion_wait(iommu); 1582 } 1583 1584 static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid) 1585 { 1586 struct iommu_cmd cmd; 1587 1588 build_inv_irt(&cmd, devid); 1589 1590 
iommu_queue_command(iommu, &cmd); 1591 } 1592 1593 static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) 1594 { 1595 u32 devid; 1596 u16 last_bdf = iommu->pci_seg->last_bdf; 1597 1598 if (iommu->irtcachedis_enabled) 1599 return; 1600 1601 for (devid = 0; devid <= last_bdf; devid++) 1602 iommu_flush_irt(iommu, devid); 1603 1604 iommu_completion_wait(iommu); 1605 } 1606 1607 void amd_iommu_flush_all_caches(struct amd_iommu *iommu) 1608 { 1609 if (check_feature(FEATURE_IA)) { 1610 amd_iommu_flush_all(iommu); 1611 } else { 1612 amd_iommu_flush_dte_all(iommu); 1613 amd_iommu_flush_irt_all(iommu); 1614 amd_iommu_flush_tlb_all(iommu); 1615 } 1616 } 1617 1618 /* 1619 * Command send function for flushing on-device TLB 1620 */ 1621 static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address, 1622 size_t size, ioasid_t pasid, bool gn) 1623 { 1624 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 1625 struct iommu_cmd cmd; 1626 int qdep = dev_data->ats_qdep; 1627 1628 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, 1629 size, pasid, gn); 1630 1631 return iommu_queue_command(iommu, &cmd); 1632 } 1633 1634 static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data) 1635 { 1636 struct amd_iommu *iommu = data; 1637 1638 return iommu_flush_dte(iommu, alias); 1639 } 1640 1641 /* 1642 * Command send function for invalidating a device table entry 1643 */ 1644 static int device_flush_dte(struct iommu_dev_data *dev_data) 1645 { 1646 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 1647 struct pci_dev *pdev = NULL; 1648 struct amd_iommu_pci_seg *pci_seg; 1649 u16 alias; 1650 int ret; 1651 1652 if (dev_is_pci(dev_data->dev)) 1653 pdev = to_pci_dev(dev_data->dev); 1654 1655 if (pdev) 1656 ret = pci_for_each_dma_alias(pdev, 1657 device_flush_dte_alias, iommu); 1658 else 1659 ret = iommu_flush_dte(iommu, dev_data->devid); 1660 if (ret) 1661 return ret; 1662 1663 pci_seg = iommu->pci_seg; 1664 alias = 
pci_seg->alias_table[dev_data->devid]; 1665 if (alias != dev_data->devid) { 1666 ret = iommu_flush_dte(iommu, alias); 1667 if (ret) 1668 return ret; 1669 } 1670 1671 if (dev_data->ats_enabled) { 1672 /* Invalidate the entire contents of an IOTLB */ 1673 ret = device_flush_iotlb(dev_data, 0, ~0UL, 1674 IOMMU_NO_PASID, false); 1675 } 1676 1677 return ret; 1678 } 1679 1680 static int domain_flush_pages_v2(struct protection_domain *pdom, 1681 u64 address, size_t size) 1682 { 1683 struct iommu_dev_data *dev_data; 1684 struct iommu_cmd cmd; 1685 int ret = 0; 1686 1687 lockdep_assert_held(&pdom->lock); 1688 list_for_each_entry(dev_data, &pdom->dev_list, list) { 1689 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 1690 u16 domid = dev_data->gcr3_info.domid; 1691 1692 build_inv_iommu_pages(&cmd, address, size, 1693 domid, IOMMU_NO_PASID, true); 1694 1695 ret |= iommu_queue_command(iommu, &cmd); 1696 } 1697 1698 return ret; 1699 } 1700 1701 static int domain_flush_pages_v1(struct protection_domain *pdom, 1702 u64 address, size_t size) 1703 { 1704 struct pdom_iommu_info *pdom_iommu_info; 1705 struct iommu_cmd cmd; 1706 int ret = 0; 1707 unsigned long i; 1708 1709 lockdep_assert_held(&pdom->lock); 1710 1711 build_inv_iommu_pages(&cmd, address, size, 1712 pdom->id, IOMMU_NO_PASID, false); 1713 1714 xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { 1715 /* 1716 * Devices of this domain are behind this IOMMU 1717 * We need a TLB flush 1718 */ 1719 ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); 1720 } 1721 1722 /* 1723 * A domain w/ v1 table can be a nest parent, which can have 1724 * multiple nested domains. Each nested domain has 1:1 mapping 1725 * between gDomID and hDomID. Therefore, flush every hDomID 1726 * associated to this nest parent domain. 
1727 * 1728 * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested() 1729 */ 1730 if (!list_empty(&pdom->viommu_list)) 1731 ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size); 1732 1733 return ret; 1734 } 1735 1736 /* 1737 * TLB invalidation function which is called from the mapping functions. 1738 * It flushes range of PTEs of the domain. 1739 */ 1740 static void __domain_flush_pages(struct protection_domain *domain, 1741 u64 address, size_t size) 1742 { 1743 struct iommu_dev_data *dev_data; 1744 int ret = 0; 1745 ioasid_t pasid = IOMMU_NO_PASID; 1746 bool gn = false; 1747 1748 lockdep_assert_held(&domain->lock); 1749 1750 if (pdom_is_v2_pgtbl_mode(domain)) { 1751 gn = true; 1752 ret = domain_flush_pages_v2(domain, address, size); 1753 } else { 1754 ret = domain_flush_pages_v1(domain, address, size); 1755 } 1756 1757 list_for_each_entry(dev_data, &domain->dev_list, list) { 1758 1759 if (!dev_data->ats_enabled) 1760 continue; 1761 1762 ret |= device_flush_iotlb(dev_data, address, size, pasid, gn); 1763 } 1764 1765 WARN_ON(ret); 1766 } 1767 1768 void amd_iommu_domain_flush_pages(struct protection_domain *domain, 1769 u64 address, size_t size) 1770 { 1771 lockdep_assert_held(&domain->lock); 1772 1773 if (likely(!amd_iommu_np_cache)) { 1774 __domain_flush_pages(domain, address, size); 1775 1776 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ 1777 domain_flush_complete(domain); 1778 1779 return; 1780 } 1781 1782 /* 1783 * When NpCache is on, we infer that we run in a VM and use a vIOMMU. 1784 * In such setups it is best to avoid flushes of ranges which are not 1785 * naturally aligned, since it would lead to flushes of unmodified 1786 * PTEs. Such flushes would require the hypervisor to do more work than 1787 * necessary. Therefore, perform repeated flushes of aligned ranges 1788 * until you cover the range. 
Each iteration flushes the smaller 1789 * between the natural alignment of the address that we flush and the 1790 * greatest naturally aligned region that fits in the range. 1791 */ 1792 while (size != 0) { 1793 int addr_alignment = __ffs(address); 1794 int size_alignment = __fls(size); 1795 int min_alignment; 1796 size_t flush_size; 1797 1798 /* 1799 * size is always non-zero, but address might be zero, causing 1800 * addr_alignment to be negative. As the casting of the 1801 * argument in __ffs(address) to long might trim the high bits 1802 * of the address on x86-32, cast to long when doing the check. 1803 */ 1804 if (likely((unsigned long)address != 0)) 1805 min_alignment = min(addr_alignment, size_alignment); 1806 else 1807 min_alignment = size_alignment; 1808 1809 flush_size = 1ul << min_alignment; 1810 1811 __domain_flush_pages(domain, address, flush_size); 1812 address += flush_size; 1813 size -= flush_size; 1814 } 1815 1816 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ 1817 domain_flush_complete(domain); 1818 } 1819 1820 /* Flush the whole IO/TLB for a given protection domain - including PDE */ 1821 static void amd_iommu_domain_flush_all(struct protection_domain *domain) 1822 { 1823 amd_iommu_domain_flush_pages(domain, 0, 1824 CMD_INV_IOMMU_ALL_PAGES_ADDRESS); 1825 } 1826 1827 void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, 1828 ioasid_t pasid, u64 address, size_t size) 1829 { 1830 struct iommu_cmd cmd; 1831 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 1832 1833 build_inv_iommu_pages(&cmd, address, size, 1834 dev_data->gcr3_info.domid, pasid, true); 1835 iommu_queue_command(iommu, &cmd); 1836 1837 if (dev_data->ats_enabled) 1838 device_flush_iotlb(dev_data, address, size, pasid, true); 1839 1840 iommu_completion_wait(iommu); 1841 } 1842 1843 static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, 1844 ioasid_t pasid) 1845 { 1846 amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0, 1847 
CMD_INV_IOMMU_ALL_PAGES_ADDRESS); 1848 } 1849 1850 int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) 1851 { 1852 struct iommu_dev_data *dev_data; 1853 struct amd_iommu *iommu; 1854 struct iommu_cmd cmd; 1855 1856 dev_data = dev_iommu_priv_get(dev); 1857 iommu = get_amd_iommu_from_dev(dev); 1858 1859 build_complete_ppr(&cmd, dev_data->devid, pasid, status, 1860 tag, dev_data->pri_tlp); 1861 1862 return iommu_queue_command(iommu, &cmd); 1863 } 1864 1865 /**************************************************************************** 1866 * 1867 * The next functions belong to the domain allocation. A domain is 1868 * allocated for every IOMMU as the default domain. If device isolation 1869 * is enabled, every device get its own domain. The most important thing 1870 * about domains is the page table mapping the DMA address space they 1871 * contain. 1872 * 1873 ****************************************************************************/ 1874 int amd_iommu_pdom_id_alloc(void) 1875 { 1876 return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); 1877 } 1878 1879 int amd_iommu_pdom_id_reserve(u16 id, gfp_t gfp) 1880 { 1881 return ida_alloc_range(&pdom_ids, id, id, gfp); 1882 } 1883 1884 void amd_iommu_pdom_id_free(int id) 1885 { 1886 ida_free(&pdom_ids, id); 1887 } 1888 1889 void amd_iommu_pdom_id_destroy(void) 1890 { 1891 ida_destroy(&pdom_ids); 1892 } 1893 1894 static void free_gcr3_tbl_level1(u64 *tbl) 1895 { 1896 u64 *ptr; 1897 int i; 1898 1899 for (i = 0; i < 512; ++i) { 1900 if (!(tbl[i] & GCR3_VALID)) 1901 continue; 1902 1903 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1904 1905 iommu_free_pages(ptr); 1906 } 1907 } 1908 1909 static void free_gcr3_tbl_level2(u64 *tbl) 1910 { 1911 u64 *ptr; 1912 int i; 1913 1914 for (i = 0; i < 512; ++i) { 1915 if (!(tbl[i] & GCR3_VALID)) 1916 continue; 1917 1918 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1919 1920 free_gcr3_tbl_level1(ptr); 1921 } 1922 } 1923 1924 static void 
free_gcr3_table(struct gcr3_tbl_info *gcr3_info) 1925 { 1926 if (gcr3_info->glx == 2) 1927 free_gcr3_tbl_level2(gcr3_info->gcr3_tbl); 1928 else if (gcr3_info->glx == 1) 1929 free_gcr3_tbl_level1(gcr3_info->gcr3_tbl); 1930 else 1931 WARN_ON_ONCE(gcr3_info->glx != 0); 1932 1933 gcr3_info->glx = 0; 1934 1935 /* Free per device domain ID */ 1936 amd_iommu_pdom_id_free(gcr3_info->domid); 1937 1938 iommu_free_pages(gcr3_info->gcr3_tbl); 1939 gcr3_info->gcr3_tbl = NULL; 1940 } 1941 1942 /* 1943 * Number of GCR3 table levels required. Level must be 4-Kbyte 1944 * page and can contain up to 512 entries. 1945 */ 1946 static int get_gcr3_levels(int pasids) 1947 { 1948 int levels; 1949 1950 if (pasids == -1) 1951 return amd_iommu_max_glx_val; 1952 1953 levels = get_count_order(pasids); 1954 1955 return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; 1956 } 1957 1958 static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, 1959 struct amd_iommu *iommu, int pasids) 1960 { 1961 int levels = get_gcr3_levels(pasids); 1962 int nid = iommu ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 1963 int domid; 1964 1965 if (levels > amd_iommu_max_glx_val) 1966 return -EINVAL; 1967 1968 if (gcr3_info->gcr3_tbl) 1969 return -EBUSY; 1970 1971 /* Allocate per device domain ID */ 1972 domid = amd_iommu_pdom_id_alloc(); 1973 if (domid <= 0) 1974 return -ENOSPC; 1975 gcr3_info->domid = domid; 1976 1977 gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K); 1978 if (gcr3_info->gcr3_tbl == NULL) { 1979 amd_iommu_pdom_id_free(domid); 1980 return -ENOMEM; 1981 } 1982 1983 gcr3_info->glx = levels; 1984 1985 return 0; 1986 } 1987 1988 static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info, 1989 ioasid_t pasid, bool alloc) 1990 { 1991 int index; 1992 u64 *pte; 1993 u64 *root = gcr3_info->gcr3_tbl; 1994 int level = gcr3_info->glx; 1995 1996 while (true) { 1997 1998 index = (pasid >> (9 * level)) & 0x1ff; 1999 pte = &root[index]; 2000 2001 if (level == 0) 2002 break; 2003 2004 if (!(*pte & GCR3_VALID)) { 2005 if (!alloc) 2006 return NULL; 2007 2008 root = (void *)get_zeroed_page(GFP_ATOMIC); 2009 if (root == NULL) 2010 return NULL; 2011 2012 *pte = iommu_virt_to_phys(root) | GCR3_VALID; 2013 } 2014 2015 root = iommu_phys_to_virt(*pte & PAGE_MASK); 2016 2017 level -= 1; 2018 } 2019 2020 return pte; 2021 } 2022 2023 static int update_gcr3(struct iommu_dev_data *dev_data, 2024 ioasid_t pasid, unsigned long gcr3, bool set) 2025 { 2026 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2027 u64 *pte; 2028 2029 pte = __get_gcr3_pte(gcr3_info, pasid, true); 2030 if (pte == NULL) 2031 return -ENOMEM; 2032 2033 if (set) 2034 *pte = (gcr3 & PAGE_MASK) | GCR3_VALID; 2035 else 2036 *pte = 0; 2037 2038 dev_flush_pasid_all(dev_data, pasid); 2039 return 0; 2040 } 2041 2042 int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid, 2043 unsigned long gcr3) 2044 { 2045 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2046 int ret; 2047 2048 iommu_group_mutex_assert(dev_data->dev); 2049 2050 ret = 
update_gcr3(dev_data, pasid, gcr3, true); 2051 if (ret) 2052 return ret; 2053 2054 gcr3_info->pasid_cnt++; 2055 return ret; 2056 } 2057 2058 int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) 2059 { 2060 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2061 int ret; 2062 2063 iommu_group_mutex_assert(dev_data->dev); 2064 2065 ret = update_gcr3(dev_data, pasid, 0, false); 2066 if (ret) 2067 return ret; 2068 2069 gcr3_info->pasid_cnt--; 2070 return ret; 2071 } 2072 2073 /* 2074 * Note: 2075 * The old value for GCR3 table and GPT have been cleared from caller. 2076 */ 2077 static void set_dte_gcr3_table(struct iommu_dev_data *dev_data, 2078 struct dev_table_entry *new) 2079 { 2080 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2081 u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); 2082 2083 new->data[0] |= DTE_FLAG_TV | 2084 (dev_data->ppr ? DTE_FLAG_PPR : 0) | 2085 (pdom_is_v2_pgtbl_mode(dev_data->domain) ? DTE_FLAG_GIOV : 0) | 2086 DTE_FLAG_GV | 2087 FIELD_PREP(DTE_GLX, gcr3_info->glx) | 2088 FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) | 2089 DTE_FLAG_IR | DTE_FLAG_IW; 2090 2091 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) | 2092 FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | 2093 (dev_data->ats_enabled ? 
DTE_FLAG_IOTLB : 0) | 2094 FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); 2095 2096 /* Guest page table can only support 4 and 5 levels */ 2097 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) 2098 new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); 2099 else 2100 new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); 2101 } 2102 2103 void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, 2104 struct protection_domain *domain, u16 domid, 2105 struct pt_iommu_amdv1_hw_info *pt_info, 2106 struct dev_table_entry *new) 2107 { 2108 u64 host_pt_root = __sme_set(pt_info->host_pt_root); 2109 2110 /* Note Dirty tracking is used for v1 table only for now */ 2111 new->data[0] |= DTE_FLAG_TV | 2112 FIELD_PREP(DTE_MODE_MASK, pt_info->mode) | 2113 (domain->dirty_tracking ? DTE_FLAG_HAD : 0) | 2114 FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) | 2115 DTE_FLAG_IR | DTE_FLAG_IW; 2116 2117 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) | 2118 (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0); 2119 } 2120 2121 static void set_dte_v1(struct iommu_dev_data *dev_data, 2122 struct protection_domain *domain, u16 domid, 2123 phys_addr_t top_paddr, unsigned int top_level, 2124 struct dev_table_entry *new) 2125 { 2126 struct pt_iommu_amdv1_hw_info pt_info; 2127 2128 /* 2129 * When updating the IO pagetable, the new top and level 2130 * are provided as parameters. For other operations i.e. 2131 * device attach, retrieve the current pagetable info 2132 * via the IOMMU PT API. 
2133 */ 2134 if (top_paddr) { 2135 pt_info.host_pt_root = top_paddr; 2136 pt_info.mode = top_level + 1; 2137 } else { 2138 WARN_ON(top_paddr || top_level); 2139 pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info); 2140 } 2141 2142 amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new); 2143 } 2144 2145 static void set_dte_passthrough(struct iommu_dev_data *dev_data, 2146 struct protection_domain *domain, 2147 struct dev_table_entry *new) 2148 { 2149 new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW; 2150 2151 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) | 2152 (dev_data->ats_enabled) ? DTE_FLAG_IOTLB : 0; 2153 } 2154 2155 static void set_dte_entry(struct amd_iommu *iommu, 2156 struct iommu_dev_data *dev_data, 2157 phys_addr_t top_paddr, unsigned int top_level) 2158 { 2159 u32 old_domid; 2160 struct dev_table_entry new = {}; 2161 struct protection_domain *domain = dev_data->domain; 2162 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2163 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2164 2165 amd_iommu_make_clear_dte(dev_data, &new); 2166 2167 old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; 2168 if (gcr3_info->gcr3_tbl) 2169 set_dte_gcr3_table(dev_data, &new); 2170 else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) 2171 set_dte_passthrough(dev_data, domain, &new); 2172 else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) && 2173 domain->pd_mode == PD_MODE_V1) 2174 set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new); 2175 else 2176 WARN_ON(true); 2177 2178 amd_iommu_update_dte(iommu, dev_data, &new); 2179 2180 /* 2181 * A kdump kernel might be replacing a domain ID that was copied from 2182 * the previous kernel--if so, it needs to flush the translation cache 2183 * entries for the old domain ID that is being overwritten 2184 */ 2185 if (old_domid) { 2186 amd_iommu_flush_tlb_domid(iommu, old_domid); 2187 } 2188 } 2189 2190 /* 2191 * Clear DMA-remap related flags to block all DMA 
 * (blocked domain)
 */
static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data)
{
	struct dev_table_entry new = {};

	amd_iommu_make_clear_dte(dev_data, &new);
	amd_iommu_update_dte(iommu, dev_data, &new);
}

/* Update and flush DTE for the given device */
static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);

	if (set)
		set_dte_entry(iommu, dev_data, 0, 0);
	else
		clear_dte_entry(iommu, dev_data);
}

/*
 * If domain is SVA capable then initialize GCR3 table. Also if domain is
 * in v2 page table mode then update GCR3[0].
 *
 * Returns 0 on success (including the case where no table is needed) or a
 * negative errno; on failure to program GCR3[0] the table is freed again.
 */
static int init_gcr3_table(struct iommu_dev_data *dev_data,
			   struct protection_domain *pdom)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	int max_pasids = dev_data->max_pasids;
	struct pt_iommu_x86_64_hw_info pt_info;
	int ret = 0;

	/*
	 * If domain is in pt mode then setup GCR3 table only if device
	 * is PASID capable
	 */
	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
		return ret;

	/*
	 * By default, setup GCR3 table to support MAX PASIDs
	 * supported by the device/IOMMU.
	 */
	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
			       max_pasids > 0 ? max_pasids : 1);
	if (ret)
		return ret;

	/* Setup GCR3[0] only if domain is setup with v2 page table mode */
	if (!pdom_is_v2_pgtbl_mode(pdom))
		return ret;

	pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
	ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
	if (ret)
		free_gcr3_table(&dev_data->gcr3_info);

	return ret;
}

/*
 * Undo init_gcr3_table(): clear GCR3[0] for v2 page table domains and free
 * the GCR3 table if one exists.
 *
 * NOTE(review): update_gcr3() runs before the gcr3_tbl NULL check, so this
 * relies on a v2 domain always having a GCR3 table - confirm with callers.
 */
static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
			       struct protection_domain *pdom)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;

	if (pdom_is_v2_pgtbl_mode(pdom))
		update_gcr3(dev_data, 0, 0, false);

	if (gcr3_info->gcr3_tbl == NULL)
		return;

	free_gcr3_table(gcr3_info);
}

/*
 * Record that @pdom has (another) device behind @iommu. The per-IOMMU entry
 * in pdom->iommu_array is reference counted and created on first use.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, -ENOSPC if
 * storing it in the xarray fails.
 */
static int pdom_attach_iommu(struct amd_iommu *iommu,
			     struct protection_domain *pdom)
{
	struct pdom_iommu_info *pdom_iommu_info, *curr;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&pdom->lock, flags);

	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
	if (pdom_iommu_info) {
		pdom_iommu_info->refcnt++;
		goto out_unlock;
	}

	/* GFP_ATOMIC: allocating while holding pdom->lock with IRQs off */
	pdom_iommu_info = kzalloc_obj(*pdom_iommu_info, GFP_ATOMIC);
	if (!pdom_iommu_info) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	pdom_iommu_info->iommu = iommu;
	pdom_iommu_info->refcnt = 1;

	curr = xa_cmpxchg(&pdom->iommu_array, iommu->index,
			  NULL, pdom_iommu_info, GFP_ATOMIC);
	if (curr) {
		kfree(pdom_iommu_info);
		ret = -ENOSPC;
		goto out_unlock;
	}

out_unlock:
	spin_unlock_irqrestore(&pdom->lock, flags);
	return ret;
}

/*
 * Drop one reference on @pdom's entry for @iommu; the entry is erased and
 * freed when the count reaches zero. No-op if no entry exists.
 */
static void pdom_detach_iommu(struct amd_iommu *iommu,
			      struct protection_domain *pdom)
{
	struct pdom_iommu_info *pdom_iommu_info;
	unsigned long flags;

	spin_lock_irqsave(&pdom->lock, flags);

	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
	if (!pdom_iommu_info) {
		spin_unlock_irqrestore(&pdom->lock, flags);
		return;
	}

	pdom_iommu_info->refcnt--;
	if (pdom_iommu_info->refcnt == 0) {
		xa_erase(&pdom->iommu_array, iommu->index);
		kfree(pdom_iommu_info);
	}

	spin_unlock_irqrestore(&pdom->lock, flags);
}

/*
 * If a device is not yet associated with a domain, this function makes the
 * device visible in the domain
 *
 * Runs under dev_data->mutex. Returns 0 on success, -EBUSY if the device
 * already has a domain, or the error from pdom_attach_iommu() /
 * init_gcr3_table().
 */
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct pci_dev *pdev;
	unsigned long flags;
	int ret = 0;

	mutex_lock(&dev_data->mutex);

	if (dev_data->domain != NULL) {
		ret = -EBUSY;
		goto out;
	}

	/* Do reference counting */
	ret = pdom_attach_iommu(iommu, domain);
	if (ret)
		goto out;

	/* Setup GCR3 table */
	if (pdom_is_sva_capable(domain)) {
		ret = init_gcr3_table(dev_data, domain);
		if (ret) {
			/* Undo the reference taken above */
			pdom_detach_iommu(iommu, domain);
			goto out;
		}
	}

	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
	if (pdev && pdom_is_sva_capable(domain)) {
		pdev_enable_caps(pdev);

		/*
		 * Device can continue to function even if IOPF
		 * enablement failed. Hence in error path just
		 * disable device PRI support.
		 */
		if (amd_iommu_iopf_add_device(iommu, dev_data))
			pdev_disable_cap_pri(pdev);
	} else if (pdev) {
		pdev_enable_cap_ats(pdev);
	}

	/* Update data structures */
	dev_data->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&dev_data->list, &domain->dev_list);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* Update device table */
	dev_update_dte(dev_data, true);

out:
	mutex_unlock(&dev_data->mutex);

	return ret;
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
static void detach_device(struct device *dev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct protection_domain *domain = dev_data->domain;
	unsigned long flags;

	mutex_lock(&dev_data->mutex);

	/*
	 * First check if the device is still attached. It might already
	 * be detached from its domain because the generic
	 * iommu_detach_group code detached it and we try again here in
	 * our alias handling.
2407 */ 2408 if (WARN_ON(!dev_data->domain)) 2409 goto out; 2410 2411 /* Remove IOPF handler */ 2412 if (dev_data->ppr) { 2413 iopf_queue_flush_dev(dev); 2414 amd_iommu_iopf_remove_device(iommu, dev_data); 2415 } 2416 2417 if (dev_is_pci(dev)) 2418 pdev_disable_caps(to_pci_dev(dev)); 2419 2420 /* Clear DTE and flush the entry */ 2421 dev_update_dte(dev_data, false); 2422 2423 /* Flush IOTLB and wait for the flushes to finish */ 2424 spin_lock_irqsave(&domain->lock, flags); 2425 amd_iommu_domain_flush_all(domain); 2426 list_del(&dev_data->list); 2427 spin_unlock_irqrestore(&domain->lock, flags); 2428 2429 /* Clear GCR3 table */ 2430 if (pdom_is_sva_capable(domain)) 2431 destroy_gcr3_table(dev_data, domain); 2432 2433 /* Update data structures */ 2434 dev_data->domain = NULL; 2435 2436 /* decrease reference counters - needs to happen after the flushes */ 2437 pdom_detach_iommu(iommu, domain); 2438 2439 out: 2440 mutex_unlock(&dev_data->mutex); 2441 } 2442 2443 static struct iommu_device *amd_iommu_probe_device(struct device *dev) 2444 { 2445 struct iommu_device *iommu_dev; 2446 struct amd_iommu *iommu; 2447 struct iommu_dev_data *dev_data; 2448 int ret; 2449 2450 if (!check_device(dev)) 2451 return ERR_PTR(-ENODEV); 2452 2453 iommu = rlookup_amd_iommu(dev); 2454 if (!iommu) 2455 return ERR_PTR(-ENODEV); 2456 2457 /* Not registered yet? */ 2458 if (!iommu->iommu.ops) 2459 return ERR_PTR(-ENODEV); 2460 2461 if (dev_iommu_priv_get(dev)) 2462 return &iommu->iommu; 2463 2464 ret = iommu_init_device(iommu, dev); 2465 if (ret) { 2466 dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); 2467 iommu_dev = ERR_PTR(ret); 2468 iommu_ignore_device(iommu, dev); 2469 goto out_err; 2470 } 2471 2472 amd_iommu_set_pci_msi_domain(dev, iommu); 2473 iommu_dev = &iommu->iommu; 2474 2475 /* 2476 * If IOMMU and device supports PASID then it will contain max 2477 * supported PASIDs, else it will be zero. 
2478 */ 2479 dev_data = dev_iommu_priv_get(dev); 2480 if (amd_iommu_pasid_supported() && dev_is_pci(dev) && 2481 pdev_pasid_supported(dev_data)) { 2482 dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids, 2483 pci_max_pasids(to_pci_dev(dev))); 2484 } 2485 2486 if (amd_iommu_pgtable == PD_MODE_NONE) { 2487 pr_warn_once("%s: DMA translation not supported by iommu.\n", 2488 __func__); 2489 iommu_dev = ERR_PTR(-ENODEV); 2490 goto out_err; 2491 } 2492 2493 iommu_completion_wait(iommu); 2494 2495 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 2496 dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; 2497 else 2498 dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; 2499 2500 if (dev_is_pci(dev)) 2501 pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); 2502 2503 out_err: 2504 return iommu_dev; 2505 } 2506 2507 static void amd_iommu_release_device(struct device *dev) 2508 { 2509 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2510 2511 WARN_ON(dev_data->domain); 2512 2513 /* 2514 * We keep dev_data around for unplugged devices and reuse it when the 2515 * device is re-plugged - not doing so would introduce a ton of races. 2516 */ 2517 } 2518 2519 static struct iommu_group *amd_iommu_device_group(struct device *dev) 2520 { 2521 if (dev_is_pci(dev)) 2522 return pci_device_group(dev); 2523 2524 return acpihid_device_group(dev); 2525 } 2526 2527 /***************************************************************************** 2528 * 2529 * The following functions belong to the exported interface of AMD IOMMU 2530 * 2531 * This interface allows access to lower level functions of the IOMMU 2532 * like protection domain handling and assignement of devices to domains 2533 * which is not possible with the dma_ops interface. 
2534 * 2535 *****************************************************************************/ 2536 2537 static void protection_domain_init(struct protection_domain *domain) 2538 { 2539 spin_lock_init(&domain->lock); 2540 INIT_LIST_HEAD(&domain->dev_list); 2541 INIT_LIST_HEAD(&domain->dev_data_list); 2542 INIT_LIST_HEAD(&domain->viommu_list); 2543 xa_init(&domain->iommu_array); 2544 } 2545 2546 struct protection_domain *protection_domain_alloc(void) 2547 { 2548 struct protection_domain *domain; 2549 int domid; 2550 2551 domain = kzalloc_obj(*domain); 2552 if (!domain) 2553 return NULL; 2554 2555 domid = amd_iommu_pdom_id_alloc(); 2556 if (domid <= 0) { 2557 kfree(domain); 2558 return NULL; 2559 } 2560 domain->id = domid; 2561 2562 protection_domain_init(domain); 2563 2564 return domain; 2565 } 2566 2567 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2568 { 2569 if (amd_iommu_hatdis) 2570 return false; 2571 2572 return iommu && (iommu->features & FEATURE_HDSUP); 2573 } 2574 2575 static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) 2576 { 2577 struct protection_domain *pdom = 2578 container_of(iommupt, struct protection_domain, iommu); 2579 2580 return &pdom->lock; 2581 } 2582 2583 /* 2584 * Update all HW references to the domain with a new pgtable configuration. 
2585 */ 2586 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 2587 phys_addr_t top_paddr, unsigned int top_level) 2588 { 2589 struct protection_domain *pdom = 2590 container_of(iommu_table, struct protection_domain, iommu); 2591 struct iommu_dev_data *dev_data; 2592 2593 lockdep_assert_held(&pdom->lock); 2594 2595 /* Update the DTE for all devices attached to this domain */ 2596 list_for_each_entry(dev_data, &pdom->dev_list, list) { 2597 struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 2598 2599 /* Update the HW references with the new level and top ptr */ 2600 set_dte_entry(iommu, dev_data, top_paddr, top_level); 2601 clone_aliases(iommu, dev_data->dev); 2602 } 2603 2604 list_for_each_entry(dev_data, &pdom->dev_list, list) 2605 device_flush_dte(dev_data); 2606 2607 domain_flush_complete(pdom); 2608 } 2609 2610 /* 2611 * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to 2612 * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non 2613 * present caching (like hypervisor shadowing). 
2614 */ 2615 static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2616 unsigned long iova, size_t size) 2617 { 2618 struct protection_domain *domain = to_pdomain(dom); 2619 unsigned long flags; 2620 2621 if (likely(!amd_iommu_np_cache)) 2622 return 0; 2623 2624 spin_lock_irqsave(&domain->lock, flags); 2625 amd_iommu_domain_flush_pages(domain, iova, size); 2626 spin_unlock_irqrestore(&domain->lock, flags); 2627 return 0; 2628 } 2629 2630 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2631 { 2632 struct protection_domain *dom = to_pdomain(domain); 2633 unsigned long flags; 2634 2635 spin_lock_irqsave(&dom->lock, flags); 2636 amd_iommu_domain_flush_all(dom); 2637 spin_unlock_irqrestore(&dom->lock, flags); 2638 } 2639 2640 static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2641 struct iommu_iotlb_gather *gather) 2642 { 2643 struct protection_domain *dom = to_pdomain(domain); 2644 unsigned long flags; 2645 2646 spin_lock_irqsave(&dom->lock, flags); 2647 amd_iommu_domain_flush_pages(dom, gather->start, 2648 gather->end - gather->start + 1); 2649 spin_unlock_irqrestore(&dom->lock, flags); 2650 iommu_put_pages_list(&gather->freelist); 2651 } 2652 2653 static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { 2654 .get_top_lock = amd_iommu_get_top_lock, 2655 .change_top = amd_iommu_change_top, 2656 }; 2657 2658 static const struct iommu_domain_ops amdv1_ops = { 2659 IOMMU_PT_DOMAIN_OPS(amdv1), 2660 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2661 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2662 .iotlb_sync = amd_iommu_iotlb_sync, 2663 .attach_dev = amd_iommu_attach_device, 2664 .free = amd_iommu_domain_free, 2665 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2666 }; 2667 2668 static const struct iommu_dirty_ops amdv1_dirty_ops = { 2669 IOMMU_PT_DIRTY_OPS(amdv1), 2670 .set_dirty_tracking = amd_iommu_set_dirty_tracking, 2671 }; 2672 2673 static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device 
*dev, 2674 u32 flags) 2675 { 2676 struct pt_iommu_amdv1_cfg cfg = {}; 2677 struct protection_domain *domain; 2678 int ret; 2679 2680 if (amd_iommu_hatdis) 2681 return ERR_PTR(-EOPNOTSUPP); 2682 2683 domain = protection_domain_alloc(); 2684 if (!domain) 2685 return ERR_PTR(-ENOMEM); 2686 2687 domain->pd_mode = PD_MODE_V1; 2688 domain->iommu.driver_ops = &amd_hw_driver_ops_v1; 2689 domain->iommu.nid = dev_to_node(dev); 2690 if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 2691 domain->domain.dirty_ops = &amdv1_dirty_ops; 2692 2693 /* 2694 * Someday FORCE_COHERENCE should be set by 2695 * amd_iommu_enforce_cache_coherency() like VT-d does. 2696 */ 2697 cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 2698 BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 2699 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 2700 2701 /* 2702 * AMD's IOMMU can flush as many pages as necessary in a single flush. 2703 * Unless we run in a virtual machine, which can be inferred according 2704 * to whether "non-present cache" is on, it is probably best to prefer 2705 * (potentially) too extensive TLB flushing (i.e., more misses) over 2706 * multiple TLB flushes (i.e., more flushes). For virtual machines the 2707 * hypervisor needs to synchronize the host IOMMU PTEs with those of 2708 * the guest, and the trade-off is different: unnecessary TLB flushes 2709 * should be avoided. 2710 */ 2711 if (amd_iommu_np_cache) 2712 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2713 else 2714 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2715 2716 cfg.common.hw_max_vasz_lg2 = 2717 min(64, (amd_iommu_hpt_level - 1) * 9 + 21); 2718 cfg.common.hw_max_oasz_lg2 = 52; 2719 cfg.starting_level = 2; 2720 domain->domain.ops = &amdv1_ops; 2721 2722 ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); 2723 if (ret) { 2724 amd_iommu_domain_free(&domain->domain); 2725 return ERR_PTR(ret); 2726 } 2727 2728 /* 2729 * Narrow the supported page sizes to those selected by the kernel 2730 * command line. 
2731 */ 2732 domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; 2733 return &domain->domain; 2734 } 2735 2736 static const struct iommu_domain_ops amdv2_ops = { 2737 IOMMU_PT_DOMAIN_OPS(x86_64), 2738 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2739 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2740 .iotlb_sync = amd_iommu_iotlb_sync, 2741 .attach_dev = amd_iommu_attach_device, 2742 .free = amd_iommu_domain_free, 2743 /* 2744 * Note the AMDv2 page table format does not support a Force Coherency 2745 * bit, so enforce_cache_coherency should not be set. However VFIO is 2746 * not prepared to handle a case where some domains will support 2747 * enforcement and others do not. VFIO and iommufd will have to be fixed 2748 * before it can fully use the V2 page table. See the comment in 2749 * iommufd_hwpt_paging_alloc(). For now leave things as they have 2750 * historically been and lie about enforce_cache_coherencey. 2751 */ 2752 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2753 }; 2754 2755 static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, 2756 u32 flags) 2757 { 2758 struct pt_iommu_x86_64_cfg cfg = {}; 2759 struct protection_domain *domain; 2760 int ret; 2761 2762 if (!amd_iommu_v2_pgtbl_supported()) 2763 return ERR_PTR(-EOPNOTSUPP); 2764 2765 domain = protection_domain_alloc(); 2766 if (!domain) 2767 return ERR_PTR(-ENOMEM); 2768 2769 domain->pd_mode = PD_MODE_V2; 2770 domain->iommu.nid = dev_to_node(dev); 2771 2772 cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); 2773 if (amd_iommu_np_cache) 2774 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2775 else 2776 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2777 2778 /* 2779 * The v2 table behaves differently if it is attached to PASID 0 vs a 2780 * non-zero PASID. On PASID 0 it has no sign extension and the full 2781 * 57/48 bits decode the lower addresses. Otherwise it behaves like a 2782 * normal sign extended x86 page table. 
Since we want the domain to work 2783 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not 2784 * set which creates a table that is compatible in both modes. 2785 */ 2786 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { 2787 cfg.common.hw_max_vasz_lg2 = 56; 2788 cfg.top_level = 4; 2789 } else { 2790 cfg.common.hw_max_vasz_lg2 = 47; 2791 cfg.top_level = 3; 2792 } 2793 cfg.common.hw_max_oasz_lg2 = 52; 2794 domain->domain.ops = &amdv2_ops; 2795 2796 ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); 2797 if (ret) { 2798 amd_iommu_domain_free(&domain->domain); 2799 return ERR_PTR(ret); 2800 } 2801 return &domain->domain; 2802 } 2803 2804 static inline bool is_nest_parent_supported(u32 flags) 2805 { 2806 /* Only allow nest parent when these features are supported */ 2807 return check_feature(FEATURE_GT) && 2808 check_feature(FEATURE_GIOSUP) && 2809 check_feature2(FEATURE_GCR3TRPMODE); 2810 } 2811 2812 static struct iommu_domain * 2813 amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 2814 const struct iommu_user_data *user_data) 2815 2816 { 2817 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2818 const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 2819 IOMMU_HWPT_ALLOC_PASID | 2820 IOMMU_HWPT_ALLOC_NEST_PARENT; 2821 2822 if ((flags & ~supported_flags) || user_data) 2823 return ERR_PTR(-EOPNOTSUPP); 2824 2825 switch (flags & supported_flags) { 2826 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: 2827 case IOMMU_HWPT_ALLOC_NEST_PARENT: 2828 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: 2829 /* 2830 * Allocate domain with v1 page table for dirty tracking 2831 * and/or Nest parent. 
2832 */ 2833 if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && 2834 !amd_iommu_hd_support(iommu)) 2835 break; 2836 2837 if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && 2838 !is_nest_parent_supported(flags)) 2839 break; 2840 2841 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2842 case IOMMU_HWPT_ALLOC_PASID: 2843 /* Allocate domain with v2 page table if IOMMU supports PASID. */ 2844 if (!amd_iommu_pasid_supported()) 2845 break; 2846 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2847 case 0: { 2848 struct iommu_domain *ret; 2849 2850 /* If nothing specific is required use the kernel commandline default */ 2851 if (amd_iommu_pgtable == PD_MODE_V1) { 2852 ret = amd_iommu_domain_alloc_paging_v1(dev, flags); 2853 if (ret != ERR_PTR(-EOPNOTSUPP)) 2854 return ret; 2855 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2856 } 2857 ret = amd_iommu_domain_alloc_paging_v2(dev, flags); 2858 if (ret != ERR_PTR(-EOPNOTSUPP)) 2859 return ret; 2860 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2861 } 2862 default: 2863 break; 2864 } 2865 return ERR_PTR(-EOPNOTSUPP); 2866 } 2867 2868 void amd_iommu_domain_free(struct iommu_domain *dom) 2869 { 2870 struct protection_domain *domain = to_pdomain(dom); 2871 2872 WARN_ON(!list_empty(&domain->dev_list)); 2873 pt_iommu_deinit(&domain->iommu); 2874 amd_iommu_pdom_id_free(domain->id); 2875 kfree(domain); 2876 } 2877 2878 static int blocked_domain_attach_device(struct iommu_domain *domain, 2879 struct device *dev, 2880 struct iommu_domain *old) 2881 { 2882 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2883 2884 if (dev_data->domain) 2885 detach_device(dev); 2886 2887 /* Clear DTE and flush the entry */ 2888 mutex_lock(&dev_data->mutex); 2889 dev_update_dte(dev_data, false); 2890 mutex_unlock(&dev_data->mutex); 2891 2892 return 0; 2893 } 2894 2895 static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, 2896 struct device *dev, ioasid_t pasid, 2897 struct iommu_domain *old) 2898 { 2899 
amd_iommu_remove_dev_pasid(dev, pasid, old); 2900 return 0; 2901 } 2902 2903 static struct iommu_domain blocked_domain = { 2904 .type = IOMMU_DOMAIN_BLOCKED, 2905 .ops = &(const struct iommu_domain_ops) { 2906 .attach_dev = blocked_domain_attach_device, 2907 .set_dev_pasid = blocked_domain_set_dev_pasid, 2908 } 2909 }; 2910 2911 static struct protection_domain identity_domain; 2912 2913 static int amd_iommu_identity_attach(struct iommu_domain *dom, struct device *dev, 2914 struct iommu_domain *old) 2915 { 2916 /* 2917 * Don't allow attaching a device to the identity domain if SNP is 2918 * enabled. 2919 */ 2920 if (amd_iommu_snp_en) 2921 return -EINVAL; 2922 2923 return amd_iommu_attach_device(dom, dev, old); 2924 } 2925 2926 static const struct iommu_domain_ops identity_domain_ops = { 2927 .attach_dev = amd_iommu_identity_attach, 2928 }; 2929 2930 void amd_iommu_init_identity_domain(void) 2931 { 2932 struct iommu_domain *domain = &identity_domain.domain; 2933 2934 domain->type = IOMMU_DOMAIN_IDENTITY; 2935 domain->ops = &identity_domain_ops; 2936 domain->owner = &amd_iommu_ops; 2937 2938 identity_domain.id = amd_iommu_pdom_id_alloc(); 2939 2940 protection_domain_init(&identity_domain); 2941 } 2942 2943 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 2944 struct iommu_domain *old) 2945 { 2946 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2947 struct protection_domain *domain = to_pdomain(dom); 2948 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2949 int ret; 2950 2951 /* 2952 * Skip attach device to domain if new domain is same as 2953 * devices current domain 2954 */ 2955 if (dev_data->domain == domain) 2956 return 0; 2957 2958 dev_data->defer_attach = false; 2959 2960 /* 2961 * Restrict to devices with compatible IOMMU hardware support 2962 * when enforcement of dirty tracking is enabled. 
2963 */ 2964 if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) 2965 return -EINVAL; 2966 2967 if (dev_data->domain) 2968 detach_device(dev); 2969 2970 ret = attach_device(dev, domain); 2971 2972 #ifdef CONFIG_IRQ_REMAP 2973 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { 2974 if (dom->type == IOMMU_DOMAIN_UNMANAGED) 2975 dev_data->use_vapic = 1; 2976 else 2977 dev_data->use_vapic = 0; 2978 } 2979 #endif 2980 2981 return ret; 2982 } 2983 2984 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) 2985 { 2986 switch (cap) { 2987 case IOMMU_CAP_CACHE_COHERENCY: 2988 return true; 2989 case IOMMU_CAP_NOEXEC: 2990 return false; 2991 case IOMMU_CAP_PRE_BOOT_PROTECTION: 2992 return amdr_ivrs_remap_support; 2993 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 2994 return true; 2995 case IOMMU_CAP_DIRTY_TRACKING: { 2996 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2997 2998 return amd_iommu_hd_support(iommu); 2999 } 3000 case IOMMU_CAP_PCI_ATS_SUPPORTED: { 3001 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 3002 3003 return amd_iommu_iotlb_sup && 3004 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); 3005 } 3006 default: 3007 break; 3008 } 3009 3010 return false; 3011 } 3012 3013 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 3014 bool enable) 3015 { 3016 struct protection_domain *pdomain = to_pdomain(domain); 3017 struct dev_table_entry *dte; 3018 struct iommu_dev_data *dev_data; 3019 bool domain_flush = false; 3020 struct amd_iommu *iommu; 3021 unsigned long flags; 3022 u64 new; 3023 3024 spin_lock_irqsave(&pdomain->lock, flags); 3025 if (!(pdomain->dirty_tracking ^ enable)) { 3026 spin_unlock_irqrestore(&pdomain->lock, flags); 3027 return 0; 3028 } 3029 3030 list_for_each_entry(dev_data, &pdomain->dev_list, list) { 3031 spin_lock(&dev_data->dte_lock); 3032 iommu = get_amd_iommu_from_dev_data(dev_data); 3033 dte = &get_dev_table(iommu)[dev_data->devid]; 3034 new = dte->data[0]; 3035 new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); 3036 dte->data[0] = new; 3037 spin_unlock(&dev_data->dte_lock); 3038 3039 /* Flush device DTE */ 3040 device_flush_dte(dev_data); 3041 domain_flush = true; 3042 } 3043 3044 /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ 3045 if (domain_flush) 3046 amd_iommu_domain_flush_all(pdomain); 3047 3048 pdomain->dirty_tracking = enable; 3049 spin_unlock_irqrestore(&pdomain->lock, flags); 3050 3051 return 0; 3052 } 3053 3054 static void amd_iommu_get_resv_regions(struct device *dev, 3055 struct list_head *head) 3056 { 3057 struct iommu_resv_region *region; 3058 struct unity_map_entry *entry; 3059 struct amd_iommu *iommu; 3060 struct amd_iommu_pci_seg *pci_seg; 3061 int devid, sbdf; 3062 3063 sbdf = get_device_sbdf_id(dev); 3064 if (sbdf < 0) 3065 return; 3066 3067 devid = PCI_SBDF_TO_DEVID(sbdf); 3068 iommu = get_amd_iommu_from_dev(dev); 3069 pci_seg = iommu->pci_seg; 3070 3071 list_for_each_entry(entry, &pci_seg->unity_map, list) { 3072 int type, prot = 0; 3073 size_t length; 3074 3075 if (devid < entry->devid_start || devid > entry->devid_end) 3076 continue; 3077 3078 type = IOMMU_RESV_DIRECT; 3079 length = entry->address_end - entry->address_start; 3080 if (entry->prot & IOMMU_PROT_IR) 3081 prot |= IOMMU_READ; 3082 if (entry->prot & IOMMU_PROT_IW) 3083 prot |= IOMMU_WRITE; 3084 if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE) 3085 /* Exclusion range */ 3086 type = IOMMU_RESV_RESERVED; 3087 3088 region = iommu_alloc_resv_region(entry->address_start, 3089 length, prot, type, 3090 GFP_KERNEL); 3091 if (!region) { 3092 dev_err(dev, "Out of memory allocating dm-regions\n"); 3093 return; 3094 } 3095 list_add_tail(®ion->list, head); 3096 } 3097 3098 region = iommu_alloc_resv_region(MSI_RANGE_START, 3099 MSI_RANGE_END - MSI_RANGE_START + 1, 3100 0, IOMMU_RESV_MSI, GFP_KERNEL); 3101 if (!region) 3102 return; 3103 list_add_tail(®ion->list, head); 3104 3105 if (amd_iommu_ht_range_ignore()) 3106 return; 3107 3108 region = 
iommu_alloc_resv_region(HT_RANGE_START, 3109 HT_RANGE_END - HT_RANGE_START + 1, 3110 0, IOMMU_RESV_RESERVED, GFP_KERNEL); 3111 if (!region) 3112 return; 3113 list_add_tail(®ion->list, head); 3114 } 3115 3116 static bool amd_iommu_is_attach_deferred(struct device *dev) 3117 { 3118 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 3119 3120 return dev_data->defer_attach; 3121 } 3122 3123 static int amd_iommu_def_domain_type(struct device *dev) 3124 { 3125 struct iommu_dev_data *dev_data; 3126 3127 dev_data = dev_iommu_priv_get(dev); 3128 if (!dev_data) 3129 return 0; 3130 3131 /* Always use DMA domain for untrusted device */ 3132 if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) 3133 return IOMMU_DOMAIN_DMA; 3134 3135 /* 3136 * Do not identity map IOMMUv2 capable devices when: 3137 * - memory encryption is active, because some of those devices 3138 * (AMD GPUs) don't have the encryption bit in their DMA-mask 3139 * and require remapping. 3140 * - SNP is enabled, because it prohibits DTE[Mode]=0. 
3141 */ 3142 if (pdev_pasid_supported(dev_data) && 3143 !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && 3144 !amd_iommu_snp_en) { 3145 return IOMMU_DOMAIN_IDENTITY; 3146 } 3147 3148 return 0; 3149 } 3150 3151 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3152 { 3153 /* IOMMU_PTE_FC is always set */ 3154 return true; 3155 } 3156 3157 const struct iommu_ops amd_iommu_ops = { 3158 .capable = amd_iommu_capable, 3159 .hw_info = amd_iommufd_hw_info, 3160 .blocked_domain = &blocked_domain, 3161 .release_domain = &blocked_domain, 3162 .identity_domain = &identity_domain.domain, 3163 .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, 3164 .domain_alloc_sva = amd_iommu_domain_alloc_sva, 3165 .probe_device = amd_iommu_probe_device, 3166 .release_device = amd_iommu_release_device, 3167 .device_group = amd_iommu_device_group, 3168 .get_resv_regions = amd_iommu_get_resv_regions, 3169 .is_attach_deferred = amd_iommu_is_attach_deferred, 3170 .def_domain_type = amd_iommu_def_domain_type, 3171 .page_response = amd_iommu_page_response, 3172 .get_viommu_size = amd_iommufd_get_viommu_size, 3173 .viommu_init = amd_iommufd_viommu_init, 3174 }; 3175 3176 #ifdef CONFIG_IRQ_REMAP 3177 3178 /***************************************************************************** 3179 * 3180 * Interrupt Remapping Implementation 3181 * 3182 *****************************************************************************/ 3183 3184 static struct irq_chip amd_ir_chip; 3185 static DEFINE_SPINLOCK(iommu_table_lock); 3186 3187 static int iommu_flush_dev_irt(struct pci_dev *unused, u16 devid, void *data) 3188 { 3189 int ret; 3190 struct iommu_cmd cmd; 3191 struct amd_iommu *iommu = data; 3192 3193 build_inv_irt(&cmd, devid); 3194 ret = __iommu_queue_command_sync(iommu, &cmd, true); 3195 return ret; 3196 } 3197 3198 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) 3199 { 3200 int ret; 3201 u64 data; 3202 unsigned long flags; 3203 struct iommu_cmd 
cmd; 3204 struct pci_dev *pdev = NULL; 3205 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3206 3207 if (iommu->irtcachedis_enabled) 3208 return; 3209 3210 if (dev_data && dev_data->dev && dev_is_pci(dev_data->dev)) 3211 pdev = to_pci_dev(dev_data->dev); 3212 3213 raw_spin_lock_irqsave(&iommu->lock, flags); 3214 data = get_cmdsem_val(iommu); 3215 build_completion_wait(&cmd, iommu, data); 3216 3217 if (pdev) 3218 ret = pci_for_each_dma_alias(pdev, iommu_flush_dev_irt, iommu); 3219 else 3220 ret = iommu_flush_dev_irt(NULL, devid, iommu); 3221 if (ret) 3222 goto out_err; 3223 3224 ret = __iommu_queue_command_sync(iommu, &cmd, false); 3225 if (ret) 3226 goto out_err; 3227 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3228 3229 wait_on_sem(iommu, data); 3230 return; 3231 3232 out_err: 3233 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3234 } 3235 3236 static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) 3237 { 3238 if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) 3239 return DTE_INTTABLEN_2K; 3240 return DTE_INTTABLEN_512; 3241 } 3242 3243 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, 3244 struct irq_remap_table *table) 3245 { 3246 u64 new; 3247 struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; 3248 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3249 3250 if (dev_data) 3251 spin_lock(&dev_data->dte_lock); 3252 3253 new = READ_ONCE(dte->data[2]); 3254 new &= ~DTE_IRQ_PHYS_ADDR_MASK; 3255 new |= iommu_virt_to_phys(table->table); 3256 new |= DTE_IRQ_REMAP_INTCTL; 3257 new |= iommu_get_int_tablen(dev_data); 3258 new |= DTE_IRQ_REMAP_ENABLE; 3259 WRITE_ONCE(dte->data[2], new); 3260 3261 if (dev_data) 3262 spin_unlock(&dev_data->dte_lock); 3263 } 3264 3265 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) 3266 { 3267 struct irq_remap_table *table; 3268 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3269 3270 if 
(WARN_ONCE(!pci_seg->rlookup_table[devid], 3271 "%s: no iommu for devid %x:%x\n", 3272 __func__, pci_seg->id, devid)) 3273 return NULL; 3274 3275 table = pci_seg->irq_lookup_table[devid]; 3276 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n", 3277 __func__, pci_seg->id, devid)) 3278 return NULL; 3279 3280 return table; 3281 } 3282 3283 static struct irq_remap_table *__alloc_irq_table(int nid, size_t size) 3284 { 3285 struct irq_remap_table *table; 3286 3287 table = kzalloc_obj(*table); 3288 if (!table) 3289 return NULL; 3290 3291 table->table = iommu_alloc_pages_node_sz( 3292 nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size)); 3293 if (!table->table) { 3294 kfree(table); 3295 return NULL; 3296 } 3297 raw_spin_lock_init(&table->lock); 3298 3299 return table; 3300 } 3301 3302 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, 3303 struct irq_remap_table *table) 3304 { 3305 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3306 3307 pci_seg->irq_lookup_table[devid] = table; 3308 set_dte_irq_entry(iommu, devid, table); 3309 iommu_flush_dte(iommu, devid); 3310 } 3311 3312 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, 3313 void *data) 3314 { 3315 struct irq_remap_table *table = data; 3316 struct amd_iommu_pci_seg *pci_seg; 3317 struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev); 3318 3319 if (!iommu) 3320 return -EINVAL; 3321 3322 pci_seg = iommu->pci_seg; 3323 pci_seg->irq_lookup_table[alias] = table; 3324 set_dte_irq_entry(iommu, alias, table); 3325 iommu_flush_dte(pci_seg->rlookup_table[alias], alias); 3326 3327 return 0; 3328 } 3329 3330 static inline size_t get_irq_table_size(unsigned int max_irqs) 3331 { 3332 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3333 return max_irqs * sizeof(u32); 3334 3335 return max_irqs * (sizeof(u64) * 2); 3336 } 3337 3338 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, 3339 u16 devid, struct pci_dev *pdev, 3340 unsigned int max_irqs) 3341 { 3342 struct 
irq_remap_table *table = NULL; 3343 struct irq_remap_table *new_table = NULL; 3344 struct amd_iommu_pci_seg *pci_seg; 3345 unsigned long flags; 3346 int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3347 u16 alias; 3348 3349 spin_lock_irqsave(&iommu_table_lock, flags); 3350 3351 pci_seg = iommu->pci_seg; 3352 table = pci_seg->irq_lookup_table[devid]; 3353 if (table) 3354 goto out_unlock; 3355 3356 alias = pci_seg->alias_table[devid]; 3357 table = pci_seg->irq_lookup_table[alias]; 3358 if (table) { 3359 set_remap_table_entry(iommu, devid, table); 3360 goto out_wait; 3361 } 3362 spin_unlock_irqrestore(&iommu_table_lock, flags); 3363 3364 /* Nothing there yet, allocate new irq remapping table */ 3365 new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs)); 3366 if (!new_table) 3367 return NULL; 3368 3369 spin_lock_irqsave(&iommu_table_lock, flags); 3370 3371 table = pci_seg->irq_lookup_table[devid]; 3372 if (table) 3373 goto out_unlock; 3374 3375 table = pci_seg->irq_lookup_table[alias]; 3376 if (table) { 3377 set_remap_table_entry(iommu, devid, table); 3378 goto out_wait; 3379 } 3380 3381 table = new_table; 3382 new_table = NULL; 3383 3384 if (pdev) 3385 pci_for_each_dma_alias(pdev, set_remap_table_entry_alias, 3386 table); 3387 else 3388 set_remap_table_entry(iommu, devid, table); 3389 3390 if (devid != alias) 3391 set_remap_table_entry(iommu, alias, table); 3392 3393 out_wait: 3394 iommu_completion_wait(iommu); 3395 3396 out_unlock: 3397 spin_unlock_irqrestore(&iommu_table_lock, flags); 3398 3399 if (new_table) { 3400 iommu_free_pages(new_table->table); 3401 kfree(new_table); 3402 } 3403 return table; 3404 } 3405 3406 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, 3407 bool align, struct pci_dev *pdev, 3408 unsigned long max_irqs) 3409 { 3410 struct irq_remap_table *table; 3411 int index, c, alignment = 1; 3412 unsigned long flags; 3413 3414 table = alloc_irq_table(iommu, devid, pdev, max_irqs); 3415 if 
(!table) 3416 return -ENODEV; 3417 3418 if (align) 3419 alignment = roundup_pow_of_two(count); 3420 3421 raw_spin_lock_irqsave(&table->lock, flags); 3422 3423 /* Scan table for free entries */ 3424 for (index = ALIGN(table->min_index, alignment), c = 0; 3425 index < max_irqs;) { 3426 if (!iommu->irte_ops->is_allocated(table, index)) { 3427 c += 1; 3428 } else { 3429 c = 0; 3430 index = ALIGN(index + 1, alignment); 3431 continue; 3432 } 3433 3434 if (c == count) { 3435 for (; c != 0; --c) 3436 iommu->irte_ops->set_allocated(table, index - c + 1); 3437 3438 index -= count - 1; 3439 goto out; 3440 } 3441 3442 index++; 3443 } 3444 3445 index = -ENOSPC; 3446 3447 out: 3448 raw_spin_unlock_irqrestore(&table->lock, flags); 3449 3450 return index; 3451 } 3452 3453 static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3454 struct irte_ga *irte) 3455 { 3456 struct irq_remap_table *table; 3457 struct irte_ga *entry; 3458 unsigned long flags; 3459 u128 old; 3460 3461 table = get_irq_table(iommu, devid); 3462 if (!table) 3463 return -ENOMEM; 3464 3465 raw_spin_lock_irqsave(&table->lock, flags); 3466 3467 entry = (struct irte_ga *)table->table; 3468 entry = &entry[index]; 3469 3470 /* 3471 * We use cmpxchg16 to atomically update the 128-bit IRTE, 3472 * and it cannot be updated by the hardware or other processors 3473 * behind us, so the return value of cmpxchg16 should be the 3474 * same as the old value. 
3475 */ 3476 old = entry->irte; 3477 WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte)); 3478 3479 raw_spin_unlock_irqrestore(&table->lock, flags); 3480 3481 return 0; 3482 } 3483 3484 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3485 struct irte_ga *irte) 3486 { 3487 int ret; 3488 3489 ret = __modify_irte_ga(iommu, devid, index, irte); 3490 if (ret) 3491 return ret; 3492 3493 iommu_flush_irt_and_complete(iommu, devid); 3494 3495 return 0; 3496 } 3497 3498 static int modify_irte(struct amd_iommu *iommu, 3499 u16 devid, int index, union irte *irte) 3500 { 3501 struct irq_remap_table *table; 3502 unsigned long flags; 3503 3504 table = get_irq_table(iommu, devid); 3505 if (!table) 3506 return -ENOMEM; 3507 3508 raw_spin_lock_irqsave(&table->lock, flags); 3509 table->table[index] = irte->val; 3510 raw_spin_unlock_irqrestore(&table->lock, flags); 3511 3512 iommu_flush_irt_and_complete(iommu, devid); 3513 3514 return 0; 3515 } 3516 3517 static void free_irte(struct amd_iommu *iommu, u16 devid, int index) 3518 { 3519 struct irq_remap_table *table; 3520 unsigned long flags; 3521 3522 table = get_irq_table(iommu, devid); 3523 if (!table) 3524 return; 3525 3526 raw_spin_lock_irqsave(&table->lock, flags); 3527 iommu->irte_ops->clear_allocated(table, index); 3528 raw_spin_unlock_irqrestore(&table->lock, flags); 3529 3530 iommu_flush_irt_and_complete(iommu, devid); 3531 } 3532 3533 static void irte_prepare(void *entry, 3534 u32 delivery_mode, bool dest_mode, 3535 u8 vector, u32 dest_apicid, int devid) 3536 { 3537 union irte *irte = (union irte *) entry; 3538 3539 irte->val = 0; 3540 irte->fields.vector = vector; 3541 irte->fields.int_type = delivery_mode; 3542 irte->fields.destination = dest_apicid; 3543 irte->fields.dm = dest_mode; 3544 irte->fields.valid = 1; 3545 } 3546 3547 static void irte_ga_prepare(void *entry, 3548 u32 delivery_mode, bool dest_mode, 3549 u8 vector, u32 dest_apicid, int devid) 3550 { 3551 struct irte_ga *irte = (struct 
irte_ga *) entry; 3552 3553 irte->lo.val = 0; 3554 irte->hi.val = 0; 3555 irte->lo.fields_remap.int_type = delivery_mode; 3556 irte->lo.fields_remap.dm = dest_mode; 3557 irte->hi.fields.vector = vector; 3558 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid); 3559 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid); 3560 irte->lo.fields_remap.valid = 1; 3561 } 3562 3563 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3564 { 3565 union irte *irte = (union irte *) entry; 3566 3567 irte->fields.valid = 1; 3568 modify_irte(iommu, devid, index, irte); 3569 } 3570 3571 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3572 { 3573 struct irte_ga *irte = (struct irte_ga *) entry; 3574 3575 irte->lo.fields_remap.valid = 1; 3576 modify_irte_ga(iommu, devid, index, irte); 3577 } 3578 3579 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3580 { 3581 union irte *irte = (union irte *) entry; 3582 3583 irte->fields.valid = 0; 3584 modify_irte(iommu, devid, index, irte); 3585 } 3586 3587 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3588 { 3589 struct irte_ga *irte = (struct irte_ga *) entry; 3590 3591 irte->lo.fields_remap.valid = 0; 3592 modify_irte_ga(iommu, devid, index, irte); 3593 } 3594 3595 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3596 u8 vector, u32 dest_apicid) 3597 { 3598 union irte *irte = (union irte *) entry; 3599 3600 irte->fields.vector = vector; 3601 irte->fields.destination = dest_apicid; 3602 modify_irte(iommu, devid, index, irte); 3603 } 3604 3605 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3606 u8 vector, u32 dest_apicid) 3607 { 3608 struct irte_ga *irte = (struct irte_ga *) entry; 3609 3610 if (!irte->lo.fields_remap.guest_mode) { 3611 irte->hi.fields.vector = vector; 
3612 irte->lo.fields_remap.destination = 3613 APICID_TO_IRTE_DEST_LO(dest_apicid); 3614 irte->hi.fields.destination = 3615 APICID_TO_IRTE_DEST_HI(dest_apicid); 3616 modify_irte_ga(iommu, devid, index, irte); 3617 } 3618 } 3619 3620 #define IRTE_ALLOCATED (~1U) 3621 static void irte_set_allocated(struct irq_remap_table *table, int index) 3622 { 3623 table->table[index] = IRTE_ALLOCATED; 3624 } 3625 3626 static void irte_ga_set_allocated(struct irq_remap_table *table, int index) 3627 { 3628 struct irte_ga *ptr = (struct irte_ga *)table->table; 3629 struct irte_ga *irte = &ptr[index]; 3630 3631 memset(&irte->lo.val, 0, sizeof(u64)); 3632 memset(&irte->hi.val, 0, sizeof(u64)); 3633 irte->hi.fields.vector = 0xff; 3634 } 3635 3636 static bool irte_is_allocated(struct irq_remap_table *table, int index) 3637 { 3638 union irte *ptr = (union irte *)table->table; 3639 union irte *irte = &ptr[index]; 3640 3641 return irte->val != 0; 3642 } 3643 3644 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) 3645 { 3646 struct irte_ga *ptr = (struct irte_ga *)table->table; 3647 struct irte_ga *irte = &ptr[index]; 3648 3649 return irte->hi.fields.vector != 0; 3650 } 3651 3652 static void irte_clear_allocated(struct irq_remap_table *table, int index) 3653 { 3654 table->table[index] = 0; 3655 } 3656 3657 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) 3658 { 3659 struct irte_ga *ptr = (struct irte_ga *)table->table; 3660 struct irte_ga *irte = &ptr[index]; 3661 3662 memset(&irte->lo.val, 0, sizeof(u64)); 3663 memset(&irte->hi.val, 0, sizeof(u64)); 3664 } 3665 3666 static int get_devid(struct irq_alloc_info *info) 3667 { 3668 switch (info->type) { 3669 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3670 return get_ioapic_devid(info->devid); 3671 case X86_IRQ_ALLOC_TYPE_HPET: 3672 return get_hpet_devid(info->devid); 3673 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3674 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3675 return get_device_sbdf_id(msi_desc_to_dev(info->desc)); 
3676 default: 3677 WARN_ON_ONCE(1); 3678 return -1; 3679 } 3680 } 3681 3682 struct irq_remap_ops amd_iommu_irq_ops = { 3683 .prepare = amd_iommu_prepare, 3684 .enable = amd_iommu_enable, 3685 .disable = amd_iommu_disable, 3686 .reenable = amd_iommu_reenable, 3687 .enable_faulting = amd_iommu_enable_faulting, 3688 }; 3689 3690 static void fill_msi_msg(struct msi_msg *msg, u32 index) 3691 { 3692 msg->data = index; 3693 msg->address_lo = 0; 3694 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; 3695 /* 3696 * The struct msi_msg.dest_mode_logical is used to set the DM bit 3697 * in MSI Message Address Register. For device w/ 2K int-remap support, 3698 * this is bit must be set to 1 regardless of the actual destination 3699 * mode, which is signified by the IRTE[DM]. 3700 */ 3701 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 3702 msg->arch_addr_lo.dest_mode_logical = true; 3703 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; 3704 } 3705 3706 static void irq_remapping_prepare_irte(struct amd_ir_data *data, 3707 struct irq_cfg *irq_cfg, 3708 struct irq_alloc_info *info, 3709 int devid, int index, int sub_handle) 3710 { 3711 struct irq_2_irte *irte_info = &data->irq_2_irte; 3712 struct amd_iommu *iommu = data->iommu; 3713 3714 if (!iommu) 3715 return; 3716 3717 data->irq_2_irte.devid = devid; 3718 data->irq_2_irte.index = index + sub_handle; 3719 iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED, 3720 apic->dest_mode_logical, irq_cfg->vector, 3721 irq_cfg->dest_apicid, devid); 3722 3723 switch (info->type) { 3724 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3725 case X86_IRQ_ALLOC_TYPE_HPET: 3726 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3727 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3728 fill_msi_msg(&data->msi_entry, irte_info->index); 3729 break; 3730 3731 default: 3732 BUG_ON(1); 3733 break; 3734 } 3735 } 3736 3737 struct amd_irte_ops irte_32_ops = { 3738 .prepare = irte_prepare, 3739 .activate = irte_activate, 3740 .deactivate = irte_deactivate, 3741 .set_affinity = 
irte_set_affinity, 3742 .set_allocated = irte_set_allocated, 3743 .is_allocated = irte_is_allocated, 3744 .clear_allocated = irte_clear_allocated, 3745 }; 3746 3747 struct amd_irte_ops irte_128_ops = { 3748 .prepare = irte_ga_prepare, 3749 .activate = irte_ga_activate, 3750 .deactivate = irte_ga_deactivate, 3751 .set_affinity = irte_ga_set_affinity, 3752 .set_allocated = irte_ga_set_allocated, 3753 .is_allocated = irte_ga_is_allocated, 3754 .clear_allocated = irte_ga_clear_allocated, 3755 }; 3756 3757 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, 3758 unsigned int nr_irqs, void *arg) 3759 { 3760 struct irq_alloc_info *info = arg; 3761 struct irq_data *irq_data; 3762 struct amd_ir_data *data = NULL; 3763 struct amd_iommu *iommu; 3764 struct irq_cfg *cfg; 3765 struct iommu_dev_data *dev_data; 3766 unsigned long max_irqs; 3767 int i, ret, devid, seg, sbdf; 3768 int index; 3769 3770 if (!info) 3771 return -EINVAL; 3772 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) 3773 return -EINVAL; 3774 3775 sbdf = get_devid(info); 3776 if (sbdf < 0) 3777 return -EINVAL; 3778 3779 seg = PCI_SBDF_TO_SEGID(sbdf); 3780 devid = PCI_SBDF_TO_DEVID(sbdf); 3781 iommu = __rlookup_amd_iommu(seg, devid); 3782 if (!iommu) 3783 return -EINVAL; 3784 3785 dev_data = search_dev_data(iommu, devid); 3786 max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; 3787 3788 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); 3789 if (ret < 0) 3790 return ret; 3791 3792 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { 3793 struct irq_remap_table *table; 3794 3795 table = alloc_irq_table(iommu, devid, NULL, max_irqs); 3796 if (table) { 3797 if (!table->min_index) { 3798 /* 3799 * Keep the first 32 indexes free for IOAPIC 3800 * interrupts. 
3801 */ 3802 table->min_index = 32; 3803 for (i = 0; i < 32; ++i) 3804 iommu->irte_ops->set_allocated(table, i); 3805 } 3806 WARN_ON(table->min_index != 32); 3807 index = info->ioapic.pin; 3808 } else { 3809 index = -ENOMEM; 3810 } 3811 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || 3812 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { 3813 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); 3814 3815 index = alloc_irq_index(iommu, devid, nr_irqs, align, 3816 msi_desc_to_pci_dev(info->desc), 3817 max_irqs); 3818 } else { 3819 index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, 3820 max_irqs); 3821 } 3822 3823 if (index < 0) { 3824 pr_warn("Failed to allocate IRTE\n"); 3825 ret = index; 3826 goto out_free_parent; 3827 } 3828 3829 for (i = 0; i < nr_irqs; i++) { 3830 irq_data = irq_domain_get_irq_data(domain, virq + i); 3831 cfg = irq_data ? irqd_cfg(irq_data) : NULL; 3832 if (!cfg) { 3833 ret = -EINVAL; 3834 goto out_free_data; 3835 } 3836 3837 ret = -ENOMEM; 3838 data = kzalloc_obj(*data); 3839 if (!data) 3840 goto out_free_data; 3841 3842 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3843 data->entry = kzalloc_obj(union irte); 3844 else 3845 data->entry = kzalloc_obj(struct irte_ga); 3846 if (!data->entry) { 3847 kfree(data); 3848 goto out_free_data; 3849 } 3850 3851 data->iommu = iommu; 3852 irq_data->hwirq = (devid << 16) + i; 3853 irq_data->chip_data = data; 3854 irq_data->chip = &amd_ir_chip; 3855 irq_remapping_prepare_irte(data, cfg, info, devid, index, i); 3856 } 3857 3858 return 0; 3859 3860 out_free_data: 3861 for (i--; i >= 0; i--) { 3862 irq_data = irq_domain_get_irq_data(domain, virq + i); 3863 if (irq_data) 3864 kfree(irq_data->chip_data); 3865 } 3866 for (i = 0; i < nr_irqs; i++) 3867 free_irte(iommu, devid, index + i); 3868 out_free_parent: 3869 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3870 return ret; 3871 } 3872 3873 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, 3874 unsigned int nr_irqs) 
3875 { 3876 struct irq_2_irte *irte_info; 3877 struct irq_data *irq_data; 3878 struct amd_ir_data *data; 3879 int i; 3880 3881 for (i = 0; i < nr_irqs; i++) { 3882 irq_data = irq_domain_get_irq_data(domain, virq + i); 3883 if (irq_data && irq_data->chip_data) { 3884 data = irq_data->chip_data; 3885 irte_info = &data->irq_2_irte; 3886 free_irte(data->iommu, irte_info->devid, irte_info->index); 3887 kfree(data->entry); 3888 kfree(data); 3889 } 3890 } 3891 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3892 } 3893 3894 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 3895 struct amd_ir_data *ir_data, 3896 struct irq_2_irte *irte_info, 3897 struct irq_cfg *cfg); 3898 3899 static int irq_remapping_activate(struct irq_domain *domain, 3900 struct irq_data *irq_data, bool reserve) 3901 { 3902 struct amd_ir_data *data = irq_data->chip_data; 3903 struct irq_2_irte *irte_info = &data->irq_2_irte; 3904 struct amd_iommu *iommu = data->iommu; 3905 struct irq_cfg *cfg = irqd_cfg(irq_data); 3906 3907 if (!iommu) 3908 return 0; 3909 3910 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid, 3911 irte_info->index); 3912 amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg); 3913 return 0; 3914 } 3915 3916 static void irq_remapping_deactivate(struct irq_domain *domain, 3917 struct irq_data *irq_data) 3918 { 3919 struct amd_ir_data *data = irq_data->chip_data; 3920 struct irq_2_irte *irte_info = &data->irq_2_irte; 3921 struct amd_iommu *iommu = data->iommu; 3922 3923 if (iommu) 3924 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid, 3925 irte_info->index); 3926 } 3927 3928 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, 3929 enum irq_domain_bus_token bus_token) 3930 { 3931 struct amd_iommu *iommu; 3932 int devid = -1; 3933 3934 if (!amd_iommu_irq_remap) 3935 return 0; 3936 3937 if (x86_fwspec_is_ioapic(fwspec)) 3938 devid = get_ioapic_devid(fwspec->param[0]); 3939 else if 
(x86_fwspec_is_hpet(fwspec)) 3940 devid = get_hpet_devid(fwspec->param[0]); 3941 3942 if (devid < 0) 3943 return 0; 3944 iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff)); 3945 3946 return iommu && iommu->ir_domain == d; 3947 } 3948 3949 static const struct irq_domain_ops amd_ir_domain_ops = { 3950 .select = irq_remapping_select, 3951 .alloc = irq_remapping_alloc, 3952 .free = irq_remapping_free, 3953 .activate = irq_remapping_activate, 3954 .deactivate = irq_remapping_deactivate, 3955 }; 3956 3957 static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, 3958 bool ga_log_intr) 3959 { 3960 if (cpu >= 0) { 3961 entry->lo.fields_vapic.destination = 3962 APICID_TO_IRTE_DEST_LO(cpu); 3963 entry->hi.fields.destination = 3964 APICID_TO_IRTE_DEST_HI(cpu); 3965 entry->lo.fields_vapic.is_run = true; 3966 entry->lo.fields_vapic.ga_log_intr = false; 3967 } else { 3968 entry->lo.fields_vapic.is_run = false; 3969 entry->lo.fields_vapic.ga_log_intr = ga_log_intr; 3970 } 3971 } 3972 3973 /* 3974 * Update the pCPU information for an IRTE that is configured to post IRQs to 3975 * a vCPU, without issuing an IOMMU invalidation for the IRTE. 3976 * 3977 * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination 3978 * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't 3979 * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based 3980 * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is 3981 * blocking and requires a notification wake event). I.e. treat vCPUs that are 3982 * associated with a pCPU as running. This API is intended to be used when a 3983 * vCPU is scheduled in/out (or stops running for any reason), to do a fast 3984 * update of IsRun, GALogIntr, and (conditionally) Destination. 
3985 * 3986 * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached 3987 * and thus don't require an invalidation to ensure the IOMMU consumes fresh 3988 * information. 3989 */ 3990 int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 3991 { 3992 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3993 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3994 3995 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3996 return -EINVAL; 3997 3998 if (!entry || !entry->lo.fields_vapic.guest_mode) 3999 return 0; 4000 4001 if (!ir_data->iommu) 4002 return -ENODEV; 4003 4004 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 4005 4006 return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4007 ir_data->irq_2_irte.index, entry); 4008 } 4009 EXPORT_SYMBOL(amd_iommu_update_ga); 4010 4011 int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 4012 { 4013 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4014 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4015 u64 valid; 4016 4017 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4018 return -EINVAL; 4019 4020 if (!entry) 4021 return 0; 4022 4023 valid = entry->lo.fields_vapic.valid; 4024 4025 entry->lo.val = 0; 4026 entry->hi.val = 0; 4027 4028 entry->lo.fields_vapic.valid = valid; 4029 entry->lo.fields_vapic.guest_mode = 1; 4030 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; 4031 entry->hi.fields.vector = ir_data->ga_vector; 4032 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; 4033 4034 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 4035 4036 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4037 ir_data->irq_2_irte.index, entry); 4038 } 4039 EXPORT_SYMBOL(amd_iommu_activate_guest_mode); 4040 4041 int amd_iommu_deactivate_guest_mode(void *data) 4042 { 4043 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4044 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4045 
struct irq_cfg *cfg = ir_data->cfg; 4046 u64 valid; 4047 4048 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4049 return -EINVAL; 4050 4051 if (!entry || !entry->lo.fields_vapic.guest_mode) 4052 return 0; 4053 4054 valid = entry->lo.fields_remap.valid; 4055 4056 entry->lo.val = 0; 4057 entry->hi.val = 0; 4058 4059 entry->lo.fields_remap.valid = valid; 4060 entry->lo.fields_remap.dm = apic->dest_mode_logical; 4061 entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED; 4062 entry->hi.fields.vector = cfg->vector; 4063 entry->lo.fields_remap.destination = 4064 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); 4065 entry->hi.fields.destination = 4066 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid); 4067 4068 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4069 ir_data->irq_2_irte.index, entry); 4070 } 4071 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); 4072 4073 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) 4074 { 4075 int ret; 4076 struct amd_iommu_pi_data *pi_data = info; 4077 struct amd_ir_data *ir_data = data->chip_data; 4078 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4079 struct iommu_dev_data *dev_data; 4080 4081 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4082 return -EINVAL; 4083 4084 if (ir_data->iommu == NULL) 4085 return -EINVAL; 4086 4087 dev_data = search_dev_data(ir_data->iommu, irte_info->devid); 4088 4089 /* Note: 4090 * This device has never been set up for guest mode. 
4091 * we should not modify the IRTE 4092 */ 4093 if (!dev_data || !dev_data->use_vapic) 4094 return -EINVAL; 4095 4096 ir_data->cfg = irqd_cfg(data); 4097 4098 if (pi_data) { 4099 pi_data->ir_data = ir_data; 4100 4101 ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); 4102 ir_data->ga_vector = pi_data->vector; 4103 ir_data->ga_tag = pi_data->ga_tag; 4104 if (pi_data->is_guest_mode) 4105 ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, 4106 pi_data->ga_log_intr); 4107 else 4108 ret = amd_iommu_deactivate_guest_mode(ir_data); 4109 } else { 4110 ret = amd_iommu_deactivate_guest_mode(ir_data); 4111 } 4112 4113 return ret; 4114 } 4115 4116 4117 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 4118 struct amd_ir_data *ir_data, 4119 struct irq_2_irte *irte_info, 4120 struct irq_cfg *cfg) 4121 { 4122 4123 /* 4124 * Atomically updates the IRTE with the new destination, vector 4125 * and flushes the interrupt entry cache. 4126 */ 4127 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid, 4128 irte_info->index, cfg->vector, 4129 cfg->dest_apicid); 4130 } 4131 4132 static int amd_ir_set_affinity(struct irq_data *data, 4133 const struct cpumask *mask, bool force) 4134 { 4135 struct amd_ir_data *ir_data = data->chip_data; 4136 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4137 struct irq_cfg *cfg = irqd_cfg(data); 4138 struct irq_data *parent = data->parent_data; 4139 struct amd_iommu *iommu = ir_data->iommu; 4140 int ret; 4141 4142 if (!iommu) 4143 return -ENODEV; 4144 4145 ret = parent->chip->irq_set_affinity(parent, mask, force); 4146 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 4147 return ret; 4148 4149 amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg); 4150 /* 4151 * After this point, all the interrupts will start arriving 4152 * at the new destination. So, time to cleanup the previous 4153 * vector allocation. 
4154 */ 4155 vector_schedule_cleanup(cfg); 4156 4157 return IRQ_SET_MASK_OK_DONE; 4158 } 4159 4160 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) 4161 { 4162 struct amd_ir_data *ir_data = irq_data->chip_data; 4163 4164 *msg = ir_data->msi_entry; 4165 } 4166 4167 static struct irq_chip amd_ir_chip = { 4168 .name = "AMD-IR", 4169 .irq_ack = apic_ack_irq, 4170 .irq_set_affinity = amd_ir_set_affinity, 4171 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, 4172 .irq_compose_msi_msg = ir_compose_msi_msg, 4173 }; 4174 4175 static const struct msi_parent_ops amdvi_msi_parent_ops = { 4176 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, 4177 .bus_select_token = DOMAIN_BUS_AMDVI, 4178 .bus_select_mask = MATCH_PCI_MSI, 4179 .prefix = "IR-", 4180 .init_dev_msi_info = msi_parent_init_dev_msi_info, 4181 }; 4182 4183 int amd_iommu_create_irq_domain(struct amd_iommu *iommu) 4184 { 4185 struct irq_domain_info info = { 4186 .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index), 4187 .ops = &amd_ir_domain_ops, 4188 .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, 4189 .host_data = iommu, 4190 .parent = arch_get_ir_parent_domain(), 4191 }; 4192 4193 if (!info.fwnode) 4194 return -ENOMEM; 4195 4196 iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops); 4197 if (!iommu->ir_domain) { 4198 irq_domain_free_fwnode(info.fwnode); 4199 return -ENOMEM; 4200 } 4201 return 0; 4202 } 4203 #endif 4204 4205 MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); 4206