// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *         Leo Duran <leo.duran@amd.com>
 */

/* Prefix for every pr_*() and dev_*() message emitted from this file. */
#define pr_fmt(fmt)     "AMD-Vi: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/string_choices.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-map-ops.h>
#include <linux/dma-direct.h>
#include <linux/idr.h>
#include <linux/iommu-helper.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/irqchip/irq-msi-lib.h>
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
#include <linux/cc_platform.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/hw_irq.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
#include <linux/generic_pt/iommu.h>

#include "amd_iommu.h"
#include "iommufd.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"

/*
 * OR the command type @t, shifted left by 28 bits, into the second 32-bit
 * word of @cmd. Because this ORs rather than assigns, @cmd->data[1] must not
 * already carry type bits; the command builders in this file zero the
 * command with memset() before filling it in.
 */
#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

/* Reserved IOVA ranges */
#define MSI_RANGE_START		(0xfee00000)
#define MSI_RANGE_END		(0xfeefffff)
#define HT_RANGE_START		(0xfd00000000ULL)
#define HT_RANGE_END		(0xffffffffffULL)

/*
 * Global lists. acpihid_map is walked in this file to match ACPI devices by
 * HID/UID; ioapic_map and hpet_map are only defined here.
 * NOTE(review): presumably all three are filled from the IVRS table by the
 * init code - confirm against the code that populates them.
 */
LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);

/* Tentative definition; the initialised instance is not in this part of the file. */
const struct iommu_ops amd_iommu_ops;

int amd_iommu_max_glx_val = -1;

/*
 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
 * to know which ones are already in use.
69 */ 70 DEFINE_IDA(pdom_ids); 71 72 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 73 struct iommu_domain *old); 74 75 static void set_dte_entry(struct amd_iommu *iommu, 76 struct iommu_dev_data *dev_data, 77 phys_addr_t top_paddr, unsigned int top_level); 78 79 static int device_flush_dte(struct iommu_dev_data *dev_data); 80 81 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 82 phys_addr_t top_paddr, unsigned int top_level); 83 84 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 85 86 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 87 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 88 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 89 bool enable); 90 91 static void clone_aliases(struct amd_iommu *iommu, struct device *dev); 92 93 static int iommu_completion_wait(struct amd_iommu *iommu); 94 95 /**************************************************************************** 96 * 97 * Helper functions 98 * 99 ****************************************************************************/ 100 101 static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) 102 { 103 /* 104 * Note: 105 * We use arch_cmpxchg128_local() because: 106 * - Need cmpxchg16b instruction mainly for 128-bit store to DTE 107 * (not necessary for cmpxchg since this function is already 108 * protected by a spin_lock for this DTE). 109 * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. 110 */ 111 arch_cmpxchg128_local(ptr, *ptr, val); 112 } 113 114 static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new) 115 { 116 struct dev_table_entry old; 117 118 old.data128[1] = ptr->data128[1]; 119 /* 120 * Preserve DTE_DATA2_INTR_MASK. This needs to be 121 * done here since it requires to be inside 122 * spin_lock(&dev_data->dte_lock) context. 
123 */ 124 new->data[2] &= ~DTE_DATA2_INTR_MASK; 125 new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK; 126 127 amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]); 128 } 129 130 static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new) 131 { 132 amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]); 133 } 134 135 /* 136 * Note: 137 * IOMMU reads the entire Device Table entry in a single 256-bit transaction 138 * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver 139 * need to ensure the following: 140 * - DTE[V|GV] bit is being written last when setting. 141 * - DTE[V|GV] bit is being written first when clearing. 142 * 143 * This function is used only by code, which updates DMA translation part of the DTE. 144 * So, only consider control bits related to DMA when updating the entry. 145 */ 146 static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 147 struct dev_table_entry *new) 148 { 149 unsigned long flags; 150 struct dev_table_entry *dev_table = get_dev_table(iommu); 151 struct dev_table_entry *ptr = &dev_table[dev_data->devid]; 152 153 spin_lock_irqsave(&dev_data->dte_lock, flags); 154 155 if (!(ptr->data[0] & DTE_FLAG_V)) { 156 /* Existing DTE is not valid. */ 157 write_dte_upper128(ptr, new); 158 write_dte_lower128(ptr, new); 159 iommu_flush_dte_sync(iommu, dev_data->devid); 160 } else if (!(new->data[0] & DTE_FLAG_V)) { 161 /* Existing DTE is valid. New DTE is not valid. */ 162 write_dte_lower128(ptr, new); 163 write_dte_upper128(ptr, new); 164 iommu_flush_dte_sync(iommu, dev_data->devid); 165 } else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) { 166 /* 167 * Both DTEs are valid. 168 * Existing DTE has no guest page table. 169 */ 170 write_dte_upper128(ptr, new); 171 write_dte_lower128(ptr, new); 172 iommu_flush_dte_sync(iommu, dev_data->devid); 173 } else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) { 174 /* 175 * Both DTEs are valid. 
176 * Existing DTE has guest page table, 177 * new DTE has no guest page table, 178 */ 179 write_dte_lower128(ptr, new); 180 write_dte_upper128(ptr, new); 181 iommu_flush_dte_sync(iommu, dev_data->devid); 182 } else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) != 183 FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) { 184 /* 185 * Both DTEs are valid and have guest page table, 186 * but have different number of levels. So, we need 187 * to upadte both upper and lower 128-bit value, which 188 * require disabling and flushing. 189 */ 190 struct dev_table_entry clear = {}; 191 192 /* First disable DTE */ 193 write_dte_lower128(ptr, &clear); 194 iommu_flush_dte_sync(iommu, dev_data->devid); 195 196 /* Then update DTE */ 197 write_dte_upper128(ptr, new); 198 write_dte_lower128(ptr, new); 199 iommu_flush_dte_sync(iommu, dev_data->devid); 200 } else { 201 /* 202 * Both DTEs are valid and have guest page table, 203 * and same number of levels. We just need to only 204 * update the lower 128-bit. So no need to disable DTE. 
205 */ 206 write_dte_lower128(ptr, new); 207 } 208 209 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 210 } 211 212 void amd_iommu_update_dte(struct amd_iommu *iommu, 213 struct iommu_dev_data *dev_data, 214 struct dev_table_entry *new) 215 { 216 update_dte256(iommu, dev_data, new); 217 clone_aliases(iommu, dev_data->dev); 218 device_flush_dte(dev_data); 219 iommu_completion_wait(iommu); 220 } 221 222 static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 223 struct dev_table_entry *dte) 224 { 225 unsigned long flags; 226 struct dev_table_entry *ptr; 227 struct dev_table_entry *dev_table = get_dev_table(iommu); 228 229 ptr = &dev_table[dev_data->devid]; 230 231 spin_lock_irqsave(&dev_data->dte_lock, flags); 232 dte->data128[0] = ptr->data128[0]; 233 dte->data128[1] = ptr->data128[1]; 234 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 235 } 236 237 static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) 238 { 239 return (pdom && (pdom->pd_mode == PD_MODE_V2)); 240 } 241 242 static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom) 243 { 244 return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY); 245 } 246 247 /* 248 * We cannot support PASID w/ existing v1 page table in the same domain 249 * since it will be nested. However, existing domain w/ v2 page table 250 * or passthrough mode can be used for PASID. 251 */ 252 static inline bool pdom_is_sva_capable(struct protection_domain *pdom) 253 { 254 return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom); 255 } 256 257 static inline int get_acpihid_device_id(struct device *dev, 258 struct acpihid_map_entry **entry) 259 { 260 struct acpi_device *adev = ACPI_COMPANION(dev); 261 struct acpihid_map_entry *p, *p1 = NULL; 262 int hid_count = 0; 263 bool fw_bug; 264 265 if (!adev) 266 return -ENODEV; 267 268 list_for_each_entry(p, &acpihid_map, list) { 269 if (acpi_dev_hid_uid_match(adev, p->hid, 270 p->uid[0] ? 
p->uid : NULL)) { 271 p1 = p; 272 fw_bug = false; 273 hid_count = 1; 274 break; 275 } 276 277 /* 278 * Count HID matches w/o UID, raise FW_BUG but allow exactly one match 279 */ 280 if (acpi_dev_hid_match(adev, p->hid)) { 281 p1 = p; 282 hid_count++; 283 fw_bug = true; 284 } 285 } 286 287 if (!p1) 288 return -EINVAL; 289 if (fw_bug) 290 dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", 291 hid_count, str_plural(hid_count)); 292 if (hid_count > 1) 293 return -EINVAL; 294 if (entry) 295 *entry = p1; 296 297 return p1->devid; 298 } 299 300 static inline int get_device_sbdf_id(struct device *dev) 301 { 302 int sbdf; 303 304 if (dev_is_pci(dev)) 305 sbdf = get_pci_sbdf_id(to_pci_dev(dev)); 306 else 307 sbdf = get_acpihid_device_id(dev, NULL); 308 309 return sbdf; 310 } 311 312 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu) 313 { 314 struct dev_table_entry *dev_table; 315 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 316 317 BUG_ON(pci_seg == NULL); 318 dev_table = pci_seg->dev_table; 319 BUG_ON(dev_table == NULL); 320 321 return dev_table; 322 } 323 324 static inline u16 get_device_segment(struct device *dev) 325 { 326 u16 seg; 327 328 if (dev_is_pci(dev)) { 329 struct pci_dev *pdev = to_pci_dev(dev); 330 331 seg = pci_domain_nr(pdev->bus); 332 } else { 333 u32 devid = get_acpihid_device_id(dev, NULL); 334 335 seg = PCI_SBDF_TO_SEGID(devid); 336 } 337 338 return seg; 339 } 340 341 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */ 342 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid) 343 { 344 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 345 346 pci_seg->rlookup_table[devid] = iommu; 347 } 348 349 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) 350 { 351 struct amd_iommu_pci_seg *pci_seg; 352 353 for_each_pci_segment(pci_seg) { 354 if (pci_seg->id != seg) 355 continue; 356 /* IVRS may not describe every device on the bus */ 357 if (devid > 
pci_seg->last_bdf) 358 return NULL; 359 return pci_seg->rlookup_table[devid]; 360 } 361 return NULL; 362 } 363 364 static struct amd_iommu *rlookup_amd_iommu(struct device *dev) 365 { 366 u16 seg = get_device_segment(dev); 367 int devid = get_device_sbdf_id(dev); 368 369 if (devid < 0) 370 return NULL; 371 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid)); 372 } 373 374 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) 375 { 376 struct iommu_dev_data *dev_data; 377 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 378 379 dev_data = kzalloc_obj(*dev_data); 380 if (!dev_data) 381 return NULL; 382 383 mutex_init(&dev_data->mutex); 384 spin_lock_init(&dev_data->dte_lock); 385 dev_data->devid = devid; 386 ratelimit_default_init(&dev_data->rs); 387 388 llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list); 389 return dev_data; 390 } 391 392 struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) 393 { 394 struct iommu_dev_data *dev_data; 395 struct llist_node *node; 396 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 397 398 if (llist_empty(&pci_seg->dev_data_list)) 399 return NULL; 400 401 node = pci_seg->dev_data_list.first; 402 llist_for_each_entry(dev_data, node, dev_data_list) { 403 if (dev_data->devid == devid) 404 return dev_data; 405 } 406 407 return NULL; 408 } 409 410 static int clone_alias(struct pci_dev *pdev_origin, u16 alias, void *data) 411 { 412 struct dev_table_entry new; 413 struct amd_iommu *iommu; 414 struct iommu_dev_data *dev_data, *alias_data; 415 struct pci_dev *pdev = data; 416 u16 devid = pci_dev_id(pdev); 417 int ret = 0; 418 419 if (devid == alias) 420 return 0; 421 422 iommu = rlookup_amd_iommu(&pdev->dev); 423 if (!iommu) 424 return 0; 425 426 /* Copy the data from pdev */ 427 dev_data = dev_iommu_priv_get(&pdev->dev); 428 if (!dev_data) { 429 pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid); 430 ret = -EINVAL; 431 goto out; 432 } 433 get_dte256(iommu, 
dev_data, &new); 434 435 /* Setup alias */ 436 alias_data = find_dev_data(iommu, alias); 437 if (!alias_data) { 438 pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias); 439 ret = -EINVAL; 440 goto out; 441 } 442 update_dte256(iommu, alias_data, &new); 443 444 amd_iommu_set_rlookup_table(iommu, alias); 445 out: 446 return ret; 447 } 448 449 static void clone_aliases(struct amd_iommu *iommu, struct device *dev) 450 { 451 struct pci_dev *pdev; 452 453 if (!dev_is_pci(dev)) 454 return; 455 pdev = to_pci_dev(dev); 456 457 /* 458 * The IVRS alias stored in the alias table may not be 459 * part of the PCI DMA aliases if it's bus differs 460 * from the original device. 461 */ 462 clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], pdev); 463 464 pci_for_each_dma_alias(pdev, clone_alias, pdev); 465 } 466 467 static void setup_aliases(struct amd_iommu *iommu, struct device *dev) 468 { 469 struct pci_dev *pdev = to_pci_dev(dev); 470 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 471 u16 ivrs_alias; 472 473 /* For ACPI HID devices, there are no aliases */ 474 if (!dev_is_pci(dev)) 475 return; 476 477 /* 478 * Add the IVRS alias to the pci aliases if it is on the same 479 * bus. The IVRS table may know about a quirk that we don't. 
480 */ 481 ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)]; 482 if (ivrs_alias != pci_dev_id(pdev) && 483 PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) 484 pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1); 485 486 clone_aliases(iommu, dev); 487 } 488 489 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid) 490 { 491 struct iommu_dev_data *dev_data; 492 493 dev_data = search_dev_data(iommu, devid); 494 495 if (dev_data == NULL) { 496 dev_data = alloc_dev_data(iommu, devid); 497 if (!dev_data) 498 return NULL; 499 500 if (translation_pre_enabled(iommu)) 501 dev_data->defer_attach = true; 502 } 503 504 return dev_data; 505 } 506 507 /* 508 * Find or create an IOMMU group for a acpihid device. 509 */ 510 static struct iommu_group *acpihid_device_group(struct device *dev) 511 { 512 struct acpihid_map_entry *p, *entry = NULL; 513 int devid; 514 515 devid = get_acpihid_device_id(dev, &entry); 516 if (devid < 0) 517 return ERR_PTR(devid); 518 519 list_for_each_entry(p, &acpihid_map, list) { 520 if ((devid == p->devid) && p->group) 521 entry->group = p->group; 522 } 523 524 if (!entry->group) 525 entry->group = generic_device_group(dev); 526 else 527 iommu_group_ref_get(entry->group); 528 529 return entry->group; 530 } 531 532 static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) 533 { 534 return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); 535 } 536 537 static u32 pdev_get_caps(struct pci_dev *pdev) 538 { 539 int features; 540 u32 flags = 0; 541 542 if (pci_ats_supported(pdev)) 543 flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 544 545 if (pci_pri_supported(pdev)) 546 flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 547 548 features = pci_pasid_features(pdev); 549 if (features >= 0) { 550 flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 551 552 if (features & PCI_PASID_CAP_EXEC) 553 flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 554 555 if (features & PCI_PASID_CAP_PRIV) 556 flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 557 } 558 559 return flags; 560 
} 561 562 static inline int pdev_enable_cap_ats(struct pci_dev *pdev) 563 { 564 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 565 int ret = -EINVAL; 566 567 if (dev_data->ats_enabled) 568 return 0; 569 570 if (amd_iommu_iotlb_sup && 571 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { 572 ret = pci_enable_ats(pdev, PAGE_SHIFT); 573 if (!ret) { 574 dev_data->ats_enabled = 1; 575 dev_data->ats_qdep = pci_ats_queue_depth(pdev); 576 } 577 } 578 579 return ret; 580 } 581 582 static inline void pdev_disable_cap_ats(struct pci_dev *pdev) 583 { 584 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 585 586 if (dev_data->ats_enabled) { 587 pci_disable_ats(pdev); 588 dev_data->ats_enabled = 0; 589 } 590 } 591 592 static inline int pdev_enable_cap_pri(struct pci_dev *pdev) 593 { 594 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 595 int ret = -EINVAL; 596 597 if (dev_data->pri_enabled) 598 return 0; 599 600 if (!dev_data->ats_enabled) 601 return 0; 602 603 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { 604 /* 605 * First reset the PRI state of the device. 
606 * FIXME: Hardcode number of outstanding requests for now 607 */ 608 if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) { 609 dev_data->pri_enabled = 1; 610 dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev); 611 612 ret = 0; 613 } 614 } 615 616 return ret; 617 } 618 619 static inline void pdev_disable_cap_pri(struct pci_dev *pdev) 620 { 621 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 622 623 if (dev_data->pri_enabled) { 624 pci_disable_pri(pdev); 625 dev_data->pri_enabled = 0; 626 } 627 } 628 629 static inline int pdev_enable_cap_pasid(struct pci_dev *pdev) 630 { 631 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 632 int ret = -EINVAL; 633 634 if (dev_data->pasid_enabled) 635 return 0; 636 637 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) { 638 /* Only allow access to user-accessible pages */ 639 ret = pci_enable_pasid(pdev, 0); 640 if (!ret) 641 dev_data->pasid_enabled = 1; 642 } 643 644 return ret; 645 } 646 647 static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) 648 { 649 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 650 651 if (dev_data->pasid_enabled) { 652 pci_disable_pasid(pdev); 653 dev_data->pasid_enabled = 0; 654 } 655 } 656 657 static void pdev_enable_caps(struct pci_dev *pdev) 658 { 659 pdev_enable_cap_pasid(pdev); 660 pdev_enable_cap_ats(pdev); 661 pdev_enable_cap_pri(pdev); 662 } 663 664 static void pdev_disable_caps(struct pci_dev *pdev) 665 { 666 pdev_disable_cap_ats(pdev); 667 pdev_disable_cap_pasid(pdev); 668 pdev_disable_cap_pri(pdev); 669 } 670 671 /* 672 * This function checks if the driver got a valid device from the caller to 673 * avoid dereferencing invalid pointers. 
674 */ 675 static bool check_device(struct device *dev) 676 { 677 struct amd_iommu_pci_seg *pci_seg; 678 struct amd_iommu *iommu; 679 int devid, sbdf; 680 681 if (!dev) 682 return false; 683 684 sbdf = get_device_sbdf_id(dev); 685 if (sbdf < 0) 686 return false; 687 devid = PCI_SBDF_TO_DEVID(sbdf); 688 689 iommu = rlookup_amd_iommu(dev); 690 if (!iommu) 691 return false; 692 693 /* Out of our scope? */ 694 pci_seg = iommu->pci_seg; 695 if (devid > pci_seg->last_bdf) 696 return false; 697 698 return true; 699 } 700 701 static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) 702 { 703 struct iommu_dev_data *dev_data; 704 int devid, sbdf; 705 706 if (dev_iommu_priv_get(dev)) 707 return 0; 708 709 sbdf = get_device_sbdf_id(dev); 710 if (sbdf < 0) 711 return sbdf; 712 713 devid = PCI_SBDF_TO_DEVID(sbdf); 714 dev_data = find_dev_data(iommu, devid); 715 if (!dev_data) 716 return -ENOMEM; 717 718 dev_data->dev = dev; 719 720 /* 721 * The dev_iommu_priv_set() needes to be called before setup_aliases. 722 * Otherwise, subsequent call to dev_iommu_priv_get() will fail. 723 */ 724 dev_iommu_priv_set(dev, dev_data); 725 setup_aliases(iommu, dev); 726 727 /* 728 * By default we use passthrough mode for IOMMUv2 capable device. 729 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to 730 * invalid address), we ignore the capability for the device so 731 * it'll be forced to go into translation mode. 
732 */ 733 if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && 734 dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { 735 dev_data->flags = pdev_get_caps(to_pci_dev(dev)); 736 } 737 738 return 0; 739 } 740 741 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) 742 { 743 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 744 struct dev_table_entry *dev_table = get_dev_table(iommu); 745 int devid, sbdf; 746 747 sbdf = get_device_sbdf_id(dev); 748 if (sbdf < 0) 749 return; 750 751 devid = PCI_SBDF_TO_DEVID(sbdf); 752 pci_seg->rlookup_table[devid] = NULL; 753 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry)); 754 755 setup_aliases(iommu, dev); 756 } 757 758 759 /**************************************************************************** 760 * 761 * Interrupt handling functions 762 * 763 ****************************************************************************/ 764 765 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) 766 { 767 int i; 768 struct dev_table_entry dte; 769 struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); 770 771 get_dte256(iommu, dev_data, &dte); 772 773 for (i = 0; i < 4; ++i) 774 pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); 775 } 776 777 static void dump_command(unsigned long phys_addr) 778 { 779 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 780 int i; 781 782 for (i = 0; i < 4; ++i) 783 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]); 784 } 785 786 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event) 787 { 788 struct iommu_dev_data *dev_data = NULL; 789 int devid, vmg_tag, flags; 790 struct pci_dev *pdev; 791 u64 spa; 792 793 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 794 vmg_tag = (event[1]) & 0xFFFF; 795 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 796 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8); 797 798 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 799 devid 
& 0xff); 800 if (pdev) 801 dev_data = dev_iommu_priv_get(&pdev->dev); 802 803 if (dev_data) { 804 if (__ratelimit(&dev_data->rs)) { 805 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 806 vmg_tag, spa, flags); 807 } 808 } else { 809 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 810 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 811 vmg_tag, spa, flags); 812 } 813 814 if (pdev) 815 pci_dev_put(pdev); 816 } 817 818 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event) 819 { 820 struct iommu_dev_data *dev_data = NULL; 821 int devid, flags_rmp, vmg_tag, flags; 822 struct pci_dev *pdev; 823 u64 gpa; 824 825 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 826 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF; 827 vmg_tag = (event[1]) & 0xFFFF; 828 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 829 gpa = ((u64)event[3] << 32) | event[2]; 830 831 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 832 devid & 0xff); 833 if (pdev) 834 dev_data = dev_iommu_priv_get(&pdev->dev); 835 836 if (dev_data) { 837 if (__ratelimit(&dev_data->rs)) { 838 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 839 vmg_tag, gpa, flags_rmp, flags); 840 } 841 } else { 842 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 843 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 844 vmg_tag, gpa, flags_rmp, flags); 845 } 846 847 if (pdev) 848 pci_dev_put(pdev); 849 } 850 851 #define IS_IOMMU_MEM_TRANSACTION(flags) \ 852 (((flags) & EVENT_FLAG_I) == 0) 853 854 #define IS_WRITE_REQUEST(flags) \ 855 ((flags) & EVENT_FLAG_RW) 856 857 static void amd_iommu_report_page_fault(struct amd_iommu *iommu, 858 u16 devid, u16 
domain_id, 859 u64 address, int flags) 860 { 861 struct iommu_dev_data *dev_data = NULL; 862 struct pci_dev *pdev; 863 864 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 865 devid & 0xff); 866 if (pdev) 867 dev_data = dev_iommu_priv_get(&pdev->dev); 868 869 if (dev_data) { 870 /* 871 * If this is a DMA fault (for which the I(nterrupt) 872 * bit will be unset), allow report_iommu_fault() to 873 * prevent logging it. 874 */ 875 if (IS_IOMMU_MEM_TRANSACTION(flags)) { 876 /* Device not attached to domain properly */ 877 if (dev_data->domain == NULL) { 878 pr_err_ratelimited("Event logged [Device not attached to domain properly]\n"); 879 pr_err_ratelimited(" device=%04x:%02x:%02x.%x domain=0x%04x\n", 880 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), 881 PCI_FUNC(devid), domain_id); 882 goto out; 883 } 884 885 if (!report_iommu_fault(&dev_data->domain->domain, 886 &pdev->dev, address, 887 IS_WRITE_REQUEST(flags) ? 888 IOMMU_FAULT_WRITE : 889 IOMMU_FAULT_READ)) 890 goto out; 891 } 892 893 if (__ratelimit(&dev_data->rs)) { 894 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", 895 domain_id, address, flags); 896 } 897 } else { 898 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", 899 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 900 domain_id, address, flags); 901 } 902 903 out: 904 if (pdev) 905 pci_dev_put(pdev); 906 } 907 908 static void iommu_print_event(struct amd_iommu *iommu, void *__evt) 909 { 910 struct device *dev = iommu->iommu.dev; 911 int type, devid, flags, tag; 912 volatile u32 *event = __evt; 913 int count = 0; 914 u64 address, ctrl; 915 u32 pasid; 916 917 retry: 918 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 919 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 920 pasid = (event[0] & EVENT_DOMID_MASK_HI) | 921 (event[1] & EVENT_DOMID_MASK_LO); 922 flags = 
(event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 923 address = (u64)(((u64)event[3]) << 32) | event[2]; 924 ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); 925 926 if (type == 0) { 927 /* Did we hit the erratum? */ 928 if (++count == LOOP_TIMEOUT) { 929 pr_err("No event written to event log\n"); 930 return; 931 } 932 udelay(1); 933 goto retry; 934 } 935 936 if (type == EVENT_TYPE_IO_FAULT) { 937 amd_iommu_report_page_fault(iommu, devid, pasid, address, flags); 938 return; 939 } 940 941 switch (type) { 942 case EVENT_TYPE_ILL_DEV: 943 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 944 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 945 pasid, address, flags); 946 dev_err(dev, "Control Reg : 0x%llx\n", ctrl); 947 dump_dte_entry(iommu, devid); 948 break; 949 case EVENT_TYPE_DEV_TAB_ERR: 950 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x " 951 "address=0x%llx flags=0x%04x]\n", 952 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 953 address, flags); 954 break; 955 case EVENT_TYPE_PAGE_TAB_ERR: 956 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n", 957 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 958 pasid, address, flags); 959 break; 960 case EVENT_TYPE_ILL_CMD: 961 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address); 962 dump_command(address); 963 break; 964 case EVENT_TYPE_CMD_HARD_ERR: 965 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n", 966 address, flags); 967 break; 968 case EVENT_TYPE_IOTLB_INV_TO: 969 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n", 970 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 971 address); 972 break; 973 case EVENT_TYPE_INV_DEV_REQ: 974 
dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 975 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 976 pasid, address, flags); 977 break; 978 case EVENT_TYPE_RMP_FAULT: 979 amd_iommu_report_rmp_fault(iommu, event); 980 break; 981 case EVENT_TYPE_RMP_HW_ERR: 982 amd_iommu_report_rmp_hw_error(iommu, event); 983 break; 984 case EVENT_TYPE_INV_PPR_REQ: 985 pasid = PPR_PASID(*((u64 *)__evt)); 986 tag = event[1] & 0x03FF; 987 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", 988 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 989 pasid, address, flags, tag); 990 break; 991 default: 992 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n", 993 event[0], event[1], event[2], event[3]); 994 } 995 996 /* 997 * To detect the hardware errata 732 we need to clear the 998 * entry back to zero. This issue does not exist on SNP 999 * enabled system. Also this buffer is not writeable on 1000 * SNP enabled system. 
	 */
	if (!amd_iommu_snp_en)
		memset(__evt, 0, 4 * sizeof(u32));
}

/*
 * Drain the event log: print every entry between the head and tail
 * pointers read from the MMIO registers, writing the advanced head
 * pointer back to the hardware after each entry.
 */
static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);

		/* Update head pointer of hardware ring-buffer */
		head = (head + EVTLOG_ENTRY_SIZE) % amd_iommu_evtlog_size;
		writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	}

}

#ifdef CONFIG_IRQ_REMAP
/* Callback invoked for GA_GUEST_NR entries of the GA log; NULL when unset */
static int (*iommu_ga_log_notifier)(u32);

/*
 * Install @notifier as the GA log callback; passing NULL removes it.
 * Always returns 0.
 */
int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
{
	iommu_ga_log_notifier = notifier;

	/*
	 * Ensure all in-flight IRQ handlers run to completion before returning
	 * to the caller, e.g. to ensure module code isn't unloaded while it's
	 * being executed in the IRQ handler.
	 */
	if (!notifier)
		synchronize_rcu();

	return 0;
}
EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);

/*
 * Drain the GA log. Nothing is done when no GA log buffer is set up.
 * For each entry the hardware head pointer is advanced first; entries of
 * type GA_GUEST_NR are then passed (by GA tag) to the registered notifier,
 * all other entry types are ignored.
 */
static void iommu_poll_ga_log(struct amd_iommu *iommu)
{
	u32 head, tail;

	if (iommu->ga_log == NULL)
		return;

	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);

	while (head != tail) {
		volatile u64 *raw;
		u64 log_entry;

		raw = (u64 *)(iommu->ga_log + head);

		/* Avoid memcpy function-call overhead */
		log_entry = *raw;

		/* Update head pointer of hardware ring-buffer */
		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);

		/* Handle GA entry */
		switch (GA_REQ_TYPE(log_entry)) {
		case GA_GUEST_NR:
			/* No consumer registered: drop the entry */
			if (!iommu_ga_log_notifier)
				break;

			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
				 __func__, GA_DEVID(log_entry),
				 GA_TAG(log_entry));

			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
				pr_err("GA log notifier failed.\n");
			break;
		default:
			break;
		}
	}
}

/*
 * Make iommu->ir_domain the MSI domain of @dev. Skipped unless IRQ
 * remapping is enabled, @dev is a PCI device and
 * pci_dev_has_default_msi_parent_domain() holds for it.
 */
static void
amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
{
	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
		return;

	dev_set_msi_domain(dev, iommu->ir_domain);
}

#else /* CONFIG_IRQ_REMAP */
/* Without CONFIG_IRQ_REMAP there is no remapping domain to install */
static inline void
amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
#endif /* !CONFIG_IRQ_REMAP */

/*
 * Common body of the threaded log interrupt handlers.
 *
 * @data:             the struct amd_iommu the interrupt belongs to
 * @evt_type:         log name, used only in the pr_devel() message
 * @int_mask:         status register bit signalling a log interrupt
 * @overflow_mask:    status register bit signalling a log overflow
 * @int_handler:      called on every loop iteration when non-NULL
 * @overflow_handler: called when the overflow bit was set in the status
 *                    value sampled before the bits were written back
 *
 * Loops for as long as either bit is set in the status register.
 */
static void amd_iommu_handle_irq(void *data, const char *evt_type,
				 u32 int_mask, u32 overflow_mask,
				 void (*int_handler)(struct amd_iommu *),
				 void (*overflow_handler)(struct amd_iommu *))
{
	struct amd_iommu *iommu = (struct amd_iommu *) data;
	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	u32 mask = int_mask | overflow_mask;

	while (status & mask) {
		/* Enable interrupt sources again */
		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);

		if (int_handler) {
			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
				 iommu->index, evt_type);
			int_handler(iommu);
		}

		if ((status & overflow_mask) && overflow_handler)
			overflow_handler(iommu);

		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling interrupt (by writing 1
		 * to clear the bit), the hardware might also try to set
		 * the interrupt bit in the event status register.
		 * In this scenario, the bit will be set, and disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, driver will need to go through the interrupt handler
		 * again and re-clear the bits
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}
}

/* Threaded handler for the event log interrupt; always returns IRQ_HANDLED */
irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
			     MMIO_STATUS_EVT_OVERFLOW_MASK,
			     iommu_poll_events, amd_iommu_restart_event_logging);

	return IRQ_HANDLED;
}

/* Threaded handler for the PPR log interrupt; always returns IRQ_HANDLED */
irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
			     MMIO_STATUS_PPR_OVERFLOW_MASK,
			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);

	return IRQ_HANDLED;
}

/*
 * Threaded handler for the GA log interrupt. Does no work when
 * CONFIG_IRQ_REMAP is disabled; always returns IRQ_HANDLED.
 */
irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
{
#ifdef CONFIG_IRQ_REMAP
	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
#endif

	return IRQ_HANDLED;
}

/* Combined threaded handler: services the event, PPR and GA logs in turn */
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	amd_iommu_int_thread_evtlog(irq, data);
	amd_iommu_int_thread_pprlog(irq, data);
	amd_iommu_int_thread_galog(irq, data);

	return IRQ_HANDLED;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

/*
 * Log the command buffer head/tail registers followed by every command
 * buffer entry, each printed as four 32-bit words.
 */
static void dump_command_buffer(struct amd_iommu *iommu)
{
	struct iommu_cmd *cmd;
	u32 head, tail;
	int i;

	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);

	pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
	       MMIO_CMD_BUFFER_TAIL(tail));

	for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
		cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
		pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
		       cmd->data[3]);
	}
}

/*
 * Busy-wait, in 1us steps, until *iommu->cmd_sem has reached @data or
 * LOOP_TIMEOUT iterations have elapsed.
 *
 * Returns 0 on success and -EIO on timeout. On timeout an alert is
 * logged and, if amd_iommu_dump is set, the command buffer is dumped
 * (at most once, via DO_ONCE_LITE).
 */
static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
	int i = 0;

	/*
	 * cmd_sem holds a monotonically non-decreasing completion sequence
	 * number.
	 */
	while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 &&
	       i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {

		pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
			 iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
			 PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));

		if (amd_iommu_dump)
			DO_ONCE_LITE(dump_command_buffer, iommu);

		return -EIO;
	}

	return 0;
}

/*
 * Copy @cmd to the cached tail offset of the command buffer, advance the
 * cached tail (wrapping at CMD_BUFFER_SIZE) and publish it through the
 * hardware tail register. No free-space check is done here.
 */
static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd)
{
	u8 *target;
	u32 tail;

	/* Copy command to buffer */
	tail = iommu->cmd_buf_tail;
	target = iommu->cmd_buf + tail;
	memcpy(target, cmd, sizeof(*cmd));

	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
	iommu->cmd_buf_tail = tail;

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}

/*
 * Build a CMD_COMPL_WAIT command carrying the physical address of the
 * command semaphore (with CMD_COMPL_WAIT_STORE_MASK set) and the 64-bit
 * value @data.
 */
static void build_completion_wait(struct iommu_cmd *cmd,
				  struct amd_iommu *iommu,
				  u64 data)
{
	u64 paddr = iommu->cmd_sem_paddr;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(paddr);
	cmd->data[2] = lower_32_bits(data);
	cmd->data[3] = upper_32_bits(data);
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

/* Build a CMD_INV_DEV_ENTRY command for device ID @devid */
static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

/*
 * Builds an invalidation address which is suitable for one
 * page or multiple
 * pages. Sets the size bit (S) as needed if more than one page is flushed.
 */
static inline u64 build_inv_address(u64 address, u64 last)
{
	unsigned int sz_lg2;

	address &= GENMASK_U64(63, 12);
	/* Position of the highest bit in which the first and last address differ */
	sz_lg2 = fls64(address ^ last);
	if (sz_lg2 <= 12)
		return address;

	/*
	 * Encode sz_lg2 according to Table 14: Example Page Size Encodings
	 *
	 * See "Note *":
	 *   Address bits 51:32 can be used to encode page sizes greater
	 *   than 4 Gbytes.
	 * Which we take to mean that the highest page size has bit
	 *   [51]=0, [50:12]=1
	 * and that coding happens when sz_lg2 is 52. Fall back to full
	 * invalidation if the size is too big.
	 *
	 */
	if (unlikely(sz_lg2 > 52))
		return CMD_INV_IOMMU_ALL_PAGES_ADDRESS |
		       CMD_INV_IOMMU_PAGES_SIZE_MASK;

	/*
	 * The sz_lg2 calculation with fls() ensures that:
	 *   address & BIT(sz_lg2 - 1) == 0
	 * Therefore only the 1's need to be added. 8KB requires no 1's
	 */
	if (sz_lg2 > 13)
		address |= GENMASK_U64(sz_lg2 - 2, 12);
	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
}

/*
 * Build a CMD_INV_IOMMU_PAGES command covering [@address, @last] for
 * domain ID @domid. @flags is OR-ed into data[2]; @pasid is only encoded
 * when @flags contains CMD_INV_IOMMU_PAGES_GN_MASK.
 */
static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  u64 last, u16 domid, ioasid_t pasid,
				  u32 flags)
{
	u64 inv_address = build_inv_address(address, last);

	memset(cmd, 0, sizeof(*cmd));

	cmd->data[1] |= domid;
	cmd->data[2] = lower_32_bits(inv_address);
	cmd->data[3] = upper_32_bits(inv_address);
	cmd->data[2] |= flags;
	if (flags & CMD_INV_IOMMU_PAGES_GN_MASK)
		cmd->data[0] |= pasid;
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
}

/*
 * Build a CMD_INV_IOTLB_PAGES command covering [@address, @last] for
 * device @devid with queue depth @qdep. When @gn is true, PASID bits
 * 15:8 go to data[0] bits 23:16, PASID bits 7:0 go to data[1] bits 23:16
 * and the GN flag is set.
 */
static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, u64 last,
				  ioasid_t pasid, bool gn)
{
	u64 inv_address = build_inv_address(address, last);

	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0] = devid;
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1] = devid;
	cmd->data[2] = lower_32_bits(inv_address);
	cmd->data[3] = upper_32_bits(inv_address);
	if (gn) {
		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
		cmd->data[1] |= (pasid & 0xff) << 16;
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	}

	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
}

/*
 * Build a CMD_COMPLETE_PPR command for @devid. @pasid and the GN flag are
 * only encoded when @gn is non-zero; @tag is truncated to 9 bits and
 * @status is masked with PPR_STATUS_MASK.
 */
static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
			       int status, int tag, u8 gn)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0] = devid;
	if (gn) {
		cmd->data[1] = pasid;
		cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	cmd->data[3] = tag & 0x1ff;
	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;

	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
}

/* Build a CMD_INV_ALL command (no operands) */
static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

/* Build a CMD_INV_IRT command for device ID @devid */
static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_IRT);
}

/*
 * Writes the command to the IOMMUs command buffer and informs the
 * hardware about the new command.
 *
 * While 0x20 bytes or less are free, the hardware head pointer is
 * re-read and the check repeated, with a 1us delay between polls after
 * the first; -EIO is returned once the poll count reaches LOOP_TIMEOUT.
 * On success @sync is recorded in iommu->need_sync and 0 is returned.
 *
 * Both callers in this file invoke it with iommu->lock held.
 */
static int __iommu_queue_command_sync(struct amd_iommu *iommu,
				      struct iommu_cmd *cmd,
				      bool sync)
{
	unsigned int count = 0;
	u32 left, next_tail;

	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
again:
	left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;

	if (left <= 0x20) {
		/* Skip udelay() the first time around */
		if (count++) {
			if (count == LOOP_TIMEOUT) {
				pr_err("Command buffer timeout\n");
				return -EIO;
			}

			udelay(1);
		}

		/* Update head and recheck remaining space */
		iommu->cmd_buf_head = readl(iommu->mmio_base +
					    MMIO_CMD_HEAD_OFFSET);

		goto again;
	}

	copy_cmd_to_buffer(iommu, cmd);

	/* Do we need to make sure all commands are processed? */
	iommu->need_sync = sync;

	return 0;
}

/* Locked wrapper: queue @cmd with iommu->lock held and interrupts disabled */
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
{
	unsigned long flags;
	int ret;

	raw_spin_lock_irqsave(&iommu->lock, flags);
	ret = __iommu_queue_command_sync(iommu, cmd, sync);
	raw_spin_unlock_irqrestore(&iommu->lock, flags);

	return ret;
}

/* Queue @cmd and mark the IOMMU as needing a later completion wait */
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}

/* Return the next completion sequence number; iommu->lock must be held */
static u64 get_cmdsem_val(struct amd_iommu *iommu)
{
	lockdep_assert_held(&iommu->lock);
	return ++iommu->cmd_sem_val;
}

/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU and waits for it to complete. It returns 0
 * immediately when no sync is pending (iommu->need_sync is clear),
 * otherwise the result of queuing the command or of wait_on_sem().
 */
static int iommu_completion_wait(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	unsigned long flags;
	int ret;
	u64 data;

	if (!iommu->need_sync)
		return 0;

	raw_spin_lock_irqsave(&iommu->lock, flags);

	data = get_cmdsem_val(iommu);
	build_completion_wait(&cmd, iommu, data);

	ret = __iommu_queue_command_sync(iommu, &cmd, false);
	raw_spin_unlock_irqrestore(&iommu->lock, flags);

	if (ret)
		return ret;

	/* The wait itself runs without iommu->lock held */
	ret = wait_on_sem(iommu, data);

	return ret;
}

/*
 * Issue a completion wait on every IOMMU recorded in the domain's
 * iommu_array. The caller must hold domain->lock.
 */
static void domain_flush_complete(struct protection_domain *domain)
{
	struct pdom_iommu_info *pdom_iommu_info;
	unsigned long i;

	lockdep_assert_held(&domain->lock);

	/*
	 * Devices of this domain are behind this IOMMU
	 * We need to wait for completion of all commands.
	 */
	xa_for_each(&domain->iommu_array, i, pdom_iommu_info)
		iommu_completion_wait(pdom_iommu_info->iommu);
}

/* Queue a device table entry invalidation for @devid (no completion wait) */
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_dte(&cmd, devid);

	return iommu_queue_command(iommu, &cmd);
}

/*
 * Queue a device table entry invalidation for @devid and, if queuing
 * succeeded, wait for its completion.
 */
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid)
{
	int ret;

	ret = iommu_flush_dte(iommu, devid);
	if (!ret)
		iommu_completion_wait(iommu);
}

/*
 * Invalidate the device table entries of all device IDs from 0 up to and
 * including the PCI segment's last_bdf, then wait for completion.
 */
static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;
	u16 last_bdf = iommu->pci_seg->last_bdf;

	for (devid = 0; devid <= last_bdf; ++devid)
		iommu_flush_dte(iommu, devid);

	iommu_completion_wait(iommu);
}

/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
 */
static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
{
	u32 dom_id;
	u16 last_bdf = iommu->pci_seg->last_bdf;

	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, U64_MAX,
				      dom_id, IOMMU_NO_PASID,
				      CMD_INV_IOMMU_PAGES_PDE_MASK);
		iommu_queue_command(iommu, &cmd);
	}

	iommu_completion_wait(iommu);
}

/*
 * Invalidate the whole address range (including PDEs) for domain ID
 * @dom_id on @iommu and wait for completion.
 */
static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
{
	struct iommu_cmd cmd;

	build_inv_iommu_pages(&cmd, 0, U64_MAX,
			      dom_id, IOMMU_NO_PASID,
			      CMD_INV_IOMMU_PAGES_PDE_MASK);
	iommu_queue_command(iommu, &cmd);

	iommu_completion_wait(iommu);
}

/*
 * Queue an invalidation of [@address, @last] for every hdom_id stored in
 * the gdomid_array of every vIOMMU on @pdom's viommu_list.
 *
 * Returns the bitwise OR of all queuing results, so a non-zero value only
 * indicates that at least one command failed to queue.
 */
static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom,
					 u64 address, u64 last, u32 flags)
{
	int ret = 0;
	struct amd_iommu_viommu *aviommu;

	list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
		unsigned long i;
		struct guest_domain_mapping_info *gdom_info;
		struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev,
						       struct amd_iommu, iommu);

		xa_lock(&aviommu->gdomid_array);
		xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
			struct iommu_cmd cmd;

			pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
				 iommu->devid, gdom_info->hdom_id);
			build_inv_iommu_pages(&cmd, address, last, gdom_info->hdom_id,
					      IOMMU_NO_PASID, flags);
			ret |= iommu_queue_command(iommu, &cmd);
		}
		xa_unlock(&aviommu->gdomid_array);
	}
	return ret;
}

/* Queue a CMD_INV_ALL command and wait for its completion */
static void amd_iommu_flush_all(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;

	build_inv_all(&cmd);

	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);
}

/* Queue an interrupt remapping table invalidation for @devid (no wait) */
static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_irt(&cmd, devid);

	iommu_queue_command(iommu, &cmd);
}

/*
 * Invalidate the interrupt remapping table of all device IDs from 0 up to
 * and including last_bdf, then wait for completion. Skipped entirely
 * when iommu->irtcachedis_enabled is set.
 */
static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
{
	u32 devid;
	u16 last_bdf = iommu->pci_seg->last_bdf;

	if (iommu->irtcachedis_enabled)
		return;

	for (devid = 0; devid <= last_bdf; devid++)
		iommu_flush_irt(iommu, devid);

	iommu_completion_wait(iommu);
}

/*
 * Flush all IOMMU caches: with FEATURE_IA a single invalidate-all command
 * is used, otherwise DTEs, IRTs and TLBs are flushed one after another.
 */
void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
{
	if (check_feature(FEATURE_IA)) {
		amd_iommu_flush_all(iommu);
	} else {
		amd_iommu_flush_dte_all(iommu);
		amd_iommu_flush_irt_all(iommu);
		amd_iommu_flush_tlb_all(iommu);
	}
}

/*
 * Command send function for flushing on-device TLB
 */
static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
			      u64 last, ioasid_t pasid, bool gn)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct iommu_cmd cmd;
	int qdep = dev_data->ats_qdep;

	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
			      last, pasid, gn);

	return iommu_queue_command(iommu, &cmd);
}

/* pci_for_each_dma_alias() callback: flush the DTE of @alias; @data is the IOMMU */
static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
{
	struct amd_iommu *iommu = data;

	return iommu_flush_dte(iommu, alias);
}

/*
 * Command send function for invalidating a device table entry
 *
 * Flushes the DTE of the device (for PCI devices: of every DMA alias),
 * then the DTE of the alias_table entry when it differs from the device
 * ID, and finally the whole device IOTLB when ATS is enabled. Returns
 * the first non-zero queuing result, else 0.
 */
static int device_flush_dte(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct pci_dev *pdev = NULL;
	struct amd_iommu_pci_seg *pci_seg;
	u16 alias;
	int ret;

	if (dev_is_pci(dev_data->dev))
		pdev = to_pci_dev(dev_data->dev);

	if (pdev)
		ret = pci_for_each_dma_alias(pdev,
					     device_flush_dte_alias, iommu);
	else
		ret = iommu_flush_dte(iommu, dev_data->devid);
	if (ret)
		return ret;

	pci_seg = iommu->pci_seg;
	alias = pci_seg->alias_table[dev_data->devid];
	if (alias != dev_data->devid) {
		ret = iommu_flush_dte(iommu, alias);
		if (ret)
			return ret;
	}

	if (dev_data->ats_enabled) {
		/* Invalidate the entire contents of an IOTLB */
		ret = device_flush_iotlb(dev_data, 0, U64_MAX,
					 IOMMU_NO_PASID, false);
	}

	return ret;
}

/*
 * Queue an invalidation of [@address, @last] for every device on the
 * domain's dev_list, using that device's own GCR3 domain ID and with
 * CMD_INV_IOMMU_PAGES_GN_MASK added to @flags. The caller must hold
 * pdom->lock. Returns the bitwise OR of all queuing results.
 */
static int domain_flush_pages_v2(struct protection_domain *pdom,
				  u64 address, u64 last, u32 flags)
{
	struct iommu_dev_data *dev_data;
	struct iommu_cmd cmd;
	int ret = 0;

	lockdep_assert_held(&pdom->lock);
	list_for_each_entry(dev_data, &pdom->dev_list, list) {
		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
		u16 domid = dev_data->gcr3_info.domid;

		build_inv_iommu_pages(&cmd, address, last, domid,
				      IOMMU_NO_PASID,
				      flags | CMD_INV_IOMMU_PAGES_GN_MASK);

		ret |= iommu_queue_command(iommu, &cmd);
	}

	return ret;
}

static
int domain_flush_pages_v1(struct protection_domain *pdom,
			  u64 address, u64 last, u32 flags)
{
	/*
	 * Queue an invalidation of [address, last] for a v1 page table
	 * domain. The caller must hold pdom->lock. The return value is the
	 * bitwise OR of all queuing results.
	 */
	struct pdom_iommu_info *info;
	struct iommu_cmd inv_cmd;
	unsigned long idx;
	int err = 0;

	lockdep_assert_held(&pdom->lock);

	/* One command, keyed by the domain ID, is reused for every IOMMU */
	build_inv_iommu_pages(&inv_cmd, address, last,
			      pdom->id, IOMMU_NO_PASID, flags);

	/* Every IOMMU with devices of this domain behind it needs a TLB flush */
	xa_for_each(&pdom->iommu_array, idx, info)
		err |= iommu_queue_command(info->iommu, &inv_cmd);

	/* Not a nest parent: nothing else to flush */
	if (list_empty(&pdom->viommu_list))
		return err;

	/*
	 * A domain w/ v1 table can be a nest parent, which can have
	 * multiple nested domains. Each nested domain has 1:1 mapping
	 * between gDomID and hDomID. Therefore, flush every hDomID
	 * associated to this nest parent domain.
	 *
	 * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
	 */
	return err | iommu_flush_pages_v1_hdom_ids(pdom, address, last, flags);
}

/*
 * TLB invalidation function which is called from the mapping functions.
 * It flushes range of PTEs of the domain.
 */
static void __domain_flush_pages(struct protection_domain *domain,
				 u64 address, u64 last, u32 flags)
{
	struct iommu_dev_data *dev_data;
	int ret = 0;
	ioasid_t pasid = IOMMU_NO_PASID;
	bool gn = false;

	lockdep_assert_held(&domain->lock);

	/* IOMMU TLB flush; v2 domains are flushed with the GN flag set */
	if (pdom_is_v2_pgtbl_mode(domain)) {
		gn = true;
		ret = domain_flush_pages_v2(domain, address, last, flags);
	} else {
		ret = domain_flush_pages_v1(domain, address, last, flags);
	}

	/* Device IOTLB flush, only for devices with ATS enabled */
	list_for_each_entry(dev_data, &domain->dev_list, list) {

		if (!dev_data->ats_enabled)
			continue;

		ret |= device_flush_iotlb(dev_data, address, last, pasid, gn);
	}

	/* No error is returned to the caller; a queuing failure only warns */
	WARN_ON(ret);
}

/*
 * Flush [@address, @last] of @domain from the IOMMU TLBs and device
 * IOTLBs and wait for completion. The caller must hold domain->lock.
 */
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
				  u64 address, u64 last, u32 flags)
{
	lockdep_assert_held(&domain->lock);

	/* Single flush when NpCache is off or the whole address space is covered */
	if (likely(!amd_iommu_np_cache) ||
	    unlikely(address == 0 && last == U64_MAX)) {
		__domain_flush_pages(domain, address, last, flags);

		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
		domain_flush_complete(domain);

		return;
	}

	/*
	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
	 * In such setups it is best to avoid flushes of ranges which are not
	 * naturally aligned, since it would lead to flushes of unmodified
	 * PTEs. Such flushes would require the hypervisor to do more work than
	 * necessary. Therefore, perform repeated flushes of aligned ranges
	 * until you cover the range. Each iteration flushes the smaller
	 * between the natural alignment of the address that we flush and the
	 * greatest naturally aligned region that fits in the range.
	 */
	while (address <= last) {
		unsigned int sz_lg2 = ilog2(last - address + 1);
		u64 flush_last;

		/* Address 0 is aligned to every size, so no clamping is needed */
		if (likely(address))
			sz_lg2 = min_t(unsigned int, sz_lg2, __ffs64(address));

		flush_last = address + (1ULL << sz_lg2) - 1;
		__domain_flush_pages(domain, address, flush_last, flags);
		/* Stop once the next start address would wrap past U64_MAX */
		if (check_add_overflow(flush_last, 1, &address))
			break;
	}

	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
	domain_flush_complete(domain);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void amd_iommu_domain_flush_all(struct protection_domain *domain)
{
	amd_iommu_domain_flush_pages(domain, 0, U64_MAX,
				     CMD_INV_IOMMU_PAGES_PDE_MASK);
}

/*
 * Flush [@address, @last] for @pasid of one device: queue a guest (GN)
 * invalidation including PDEs for the device's GCR3 domain ID, flush the
 * device IOTLB when ATS is enabled, then wait for completion.
 */
void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
				     ioasid_t pasid, u64 address, u64 last)
{
	struct iommu_cmd cmd;
	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);

	build_inv_iommu_pages(&cmd, address, last,
			      dev_data->gcr3_info.domid, pasid,
			      CMD_INV_IOMMU_PAGES_GN_MASK |
			      CMD_INV_IOMMU_PAGES_PDE_MASK);
	iommu_queue_command(iommu, &cmd);

	if (dev_data->ats_enabled)
		device_flush_iotlb(dev_data, address, last, pasid, true);

	iommu_completion_wait(iommu);
}

/* Flush the whole address range for @pasid of one device */
static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
				ioasid_t pasid)
{
	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0, U64_MAX);
}

/*
 * Queue a CMD_COMPLETE_PPR command for @dev with the given @pasid,
 * @status and @tag. Returns the result of queuing the command.
 */
int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;

	dev_data = dev_iommu_priv_get(dev);
	iommu    = get_amd_iommu_from_dev(dev);

	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
			   tag, dev_data->pri_tlp);

	return iommu_queue_command(iommu, &cmd);
}

/**************************************************************************** 1859 * 1860 * The next functions belong to the domain allocation. A domain is 1861 * allocated for every IOMMU as the default domain. If device isolation 1862 * is enabled, every device get its own domain. The most important thing 1863 * about domains is the page table mapping the DMA address space they 1864 * contain. 1865 * 1866 ****************************************************************************/ 1867 int amd_iommu_pdom_id_alloc(void) 1868 { 1869 return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); 1870 } 1871 1872 int amd_iommu_pdom_id_reserve(u16 id, gfp_t gfp) 1873 { 1874 return ida_alloc_range(&pdom_ids, id, id, gfp); 1875 } 1876 1877 void amd_iommu_pdom_id_free(int id) 1878 { 1879 ida_free(&pdom_ids, id); 1880 } 1881 1882 void amd_iommu_pdom_id_destroy(void) 1883 { 1884 ida_destroy(&pdom_ids); 1885 } 1886 1887 static void free_gcr3_tbl_level1(u64 *tbl) 1888 { 1889 u64 *ptr; 1890 int i; 1891 1892 for (i = 0; i < 512; ++i) { 1893 if (!(tbl[i] & GCR3_VALID)) 1894 continue; 1895 1896 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1897 1898 iommu_free_pages(ptr); 1899 } 1900 } 1901 1902 static void free_gcr3_tbl_level2(u64 *tbl) 1903 { 1904 u64 *ptr; 1905 int i; 1906 1907 for (i = 0; i < 512; ++i) { 1908 if (!(tbl[i] & GCR3_VALID)) 1909 continue; 1910 1911 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1912 1913 free_gcr3_tbl_level1(ptr); 1914 } 1915 } 1916 1917 static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) 1918 { 1919 if (gcr3_info->glx == 2) 1920 free_gcr3_tbl_level2(gcr3_info->gcr3_tbl); 1921 else if (gcr3_info->glx == 1) 1922 free_gcr3_tbl_level1(gcr3_info->gcr3_tbl); 1923 else 1924 WARN_ON_ONCE(gcr3_info->glx != 0); 1925 1926 gcr3_info->glx = 0; 1927 1928 /* Free per device domain ID */ 1929 amd_iommu_pdom_id_free(gcr3_info->domid); 1930 1931 iommu_free_pages(gcr3_info->gcr3_tbl); 1932 gcr3_info->gcr3_tbl = NULL; 1933 } 1934 1935 /* 1936 * 
 * Number of GCR3 table levels required. Level must be 4-Kbyte
 * page and can contain up to 512 entries.
 *
 * The value returned is the level count minus one (9 PASID bits are
 * resolved per level); @pasids == -1 selects amd_iommu_max_glx_val.
 */
static int get_gcr3_levels(int pasids)
{
	int levels;

	if (pasids == -1)
		return amd_iommu_max_glx_val;

	levels = get_count_order(pasids);

	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
}

/*
 * Allocate a per-device domain ID and the root GCR3 table page for
 * @pasids PASIDs, on the NUMA node of @iommu when one is given.
 *
 * Returns 0 on success, -EINVAL when more levels are needed than
 * amd_iommu_max_glx_val allows, -EBUSY when a table is already set up,
 * -ENOSPC when no domain ID is available and -ENOMEM when the page
 * allocation fails.
 */
static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
			    struct amd_iommu *iommu, int pasids)
{
	int levels = get_gcr3_levels(pasids);
	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
	int domid;

	if (levels > amd_iommu_max_glx_val)
		return -EINVAL;

	if (gcr3_info->gcr3_tbl)
		return -EBUSY;

	/* Allocate per device domain ID */
	domid = amd_iommu_pdom_id_alloc();
	if (domid <= 0)
		return -ENOSPC;
	gcr3_info->domid = domid;

	gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K);
	if (gcr3_info->gcr3_tbl == NULL) {
		amd_iommu_pdom_id_free(domid);
		return -ENOMEM;
	}

	gcr3_info->glx = levels;

	return 0;
}

/*
 * Walk the GCR3 table from the root down to level 0 and return a pointer
 * to the entry for @pasid. A missing intermediate table is allocated when
 * @alloc is true; otherwise, or when that allocation fails, NULL is
 * returned.
 */
static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
			   ioasid_t pasid, bool alloc)
{
	int index;
	u64 *pte;
	u64 *root = gcr3_info->gcr3_tbl;
	int level = gcr3_info->glx;

	while (true) {

		/* Each level is indexed by 9 bits of the PASID */
		index = (pasid >> (9 * level)) & 0x1ff;
		pte = &root[index];

		if (level == 0)
			break;

		if (!(*pte & GCR3_VALID)) {
			if (!alloc)
				return NULL;

			root = (void *)get_zeroed_page(GFP_ATOMIC);
			if (root == NULL)
				return NULL;

			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
		}

		root = iommu_phys_to_virt(*pte & PAGE_MASK);

		level -= 1;
	}

	return pte;
}

/*
 * Set (@set == true) or clear the GCR3 entry of @pasid, then flush the
 * whole address range for that PASID on the device.
 *
 * Intermediate tables are allocated as needed in both cases. Returns 0,
 * or -ENOMEM when the entry could not be reached.
 */
static int update_gcr3(struct iommu_dev_data *dev_data,
		       ioasid_t pasid, unsigned long gcr3, bool set)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	u64 *pte;

	pte = __get_gcr3_pte(gcr3_info, pasid, true);
	if (pte == NULL)
		return -ENOMEM;

	if (set)
		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
	else
		*pte = 0;

	dev_flush_pasid_all(dev_data, pasid);
	return 0;
}

/*
 * Install @gcr3 as the GCR3 entry for @pasid and, on success, increment
 * the device's pasid_cnt. Asserts the IOMMU group mutex of the device.
 */
int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
		       unsigned long gcr3)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	int ret;

	iommu_group_mutex_assert(dev_data->dev);

	ret = update_gcr3(dev_data, pasid, gcr3, true);
	if (ret)
		return ret;

	gcr3_info->pasid_cnt++;
	return ret;
}

/*
 * Clear the GCR3 entry for @pasid and, on success, decrement the device's
 * pasid_cnt. Asserts the IOMMU group mutex of the device.
 */
int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	int ret;

	iommu_group_mutex_assert(dev_data->dev);

	ret = update_gcr3(dev_data, pasid, 0, false);
	if (ret)
		return ret;

	gcr3_info->pasid_cnt--;
	return ret;
}

/*
 * Note:
 * The old value for GCR3 table and GPT have been cleared from caller.
 */
static void set_dte_gcr3_table(struct iommu_dev_data *dev_data,
			       struct dev_table_entry *new)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);

	/* The GCR3 root address is split across three DTE fields */
	new->data[0] |= DTE_FLAG_TV |
			(dev_data->ppr ? DTE_FLAG_PPR : 0) |
			(pdom_is_v2_pgtbl_mode(dev_data->domain) ? DTE_FLAG_GIOV : 0) |
			DTE_FLAG_GV |
			FIELD_PREP(DTE_GLX, gcr3_info->glx) |
			FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) |
			DTE_FLAG_IR | DTE_FLAG_IW;

	new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) |
			FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
			FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);

	/* Guest page table can only support 4 and 5 levels */
	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
		new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
	else
		new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
}

/*
 * Fill the v1 (host) page table fields of @new from @pt_info: paging
 * mode, page table root, domain ID @domid, the dirty tracking flag of
 * @domain and the IOTLB flag when the device has ATS enabled.
 */
void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
			  struct protection_domain *domain, u16 domid,
			  struct pt_iommu_amdv1_hw_info *pt_info,
			  struct dev_table_entry *new)
{
	u64 host_pt_root = __sme_set(pt_info->host_pt_root);

	/* Note Dirty tracking is used for v1 table only for now */
	new->data[0] |= DTE_FLAG_TV |
			FIELD_PREP(DTE_MODE_MASK, pt_info->mode) |
			(domain->dirty_tracking ? DTE_FLAG_HAD : 0) |
			FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) |
			DTE_FLAG_IR | DTE_FLAG_IW;

	new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) |
			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);
}

/*
 * Resolve the page table root and mode to program and hand them to
 * amd_iommu_set_dte_v1().
 */
static void set_dte_v1(struct iommu_dev_data *dev_data,
		       struct protection_domain *domain, u16 domid,
		       phys_addr_t top_paddr, unsigned int top_level,
		       struct dev_table_entry *new)
{
	struct pt_iommu_amdv1_hw_info pt_info;

	/*
	 * When updating the IO pagetable, the new top and level
	 * are provided as parameters. For other operations i.e.
	 * device attach, retrieve the current pagetable info
	 * via the IOMMU PT API.
	 */
	if (top_paddr) {
		pt_info.host_pt_root = top_paddr;
		pt_info.mode = top_level + 1;
	} else {
		/* top_paddr is 0 here, so this warns on a level without a top */
		WARN_ON(top_paddr || top_level);
		pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info);
	}

	amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new);
}

/*
 * Fill @new for an identity domain: translation valid with read and write
 * permission but no page table fields, plus the domain ID and the IOTLB
 * flag when the device has ATS enabled.
 */
static void set_dte_passthrough(struct iommu_dev_data *dev_data,
				struct protection_domain *domain,
				struct dev_table_entry *new)
{
	new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW;

	new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) |
			(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);

}

/*
 * Build a new DTE for the device according to its current state (GCR3
 * table present, identity domain, or v1 paging domain) and write it to
 * the device table. @top_paddr/@top_level are only used on the v1 path.
 */
static void set_dte_entry(struct amd_iommu *iommu,
			  struct iommu_dev_data *dev_data,
			  phys_addr_t top_paddr, unsigned int top_level)
{
	u32 old_domid;
	struct dev_table_entry new = {};
	struct protection_domain *domain = dev_data->domain;
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
	struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];

	amd_iommu_make_clear_dte(dev_data, &new);

	/* Sample the domain ID currently in the DTE before it is replaced */
	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
	if (gcr3_info->gcr3_tbl)
		set_dte_gcr3_table(dev_data, &new);
	else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
		set_dte_passthrough(dev_data, domain, &new);
	else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) &&
		 domain->pd_mode == PD_MODE_V1)
		set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new);
	else
		WARN_ON(true);

	amd_iommu_update_dte(iommu, dev_data, &new);

	/*
	 * A kdump kernel might be replacing a domain ID that was copied from
	 * the previous kernel--if so, it needs to flush the translation cache
	 * entries for the old domain ID that is being overwritten
	 */
	if (old_domid) {
		amd_iommu_flush_tlb_domid(iommu, old_domid);
	}
}

/*
 * Clear DMA-remap related flags to block all DMA
 * (blocked domain)
 */
static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data)
{
	struct dev_table_entry new = {};

	amd_iommu_make_clear_dte(dev_data, &new);
	amd_iommu_update_dte(iommu, dev_data, &new);
}

/* Update and flush DTE for the given device */
static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);

	if (set)
		set_dte_entry(iommu, dev_data, 0, 0);
	else
		clear_dte_entry(iommu, dev_data);
}

/*
 * If domain is SVA capable then initialize GCR3 table. Also if domain is
 * in v2 page table mode then update GCR3[0].
 *
 * Returns 0 on success (including the case where no table is needed) or
 * a negative errno; the table is freed again if setting GCR3[0] fails.
 */
static int init_gcr3_table(struct iommu_dev_data *dev_data,
			   struct protection_domain *pdom)
{
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	int max_pasids = dev_data->max_pasids;
	struct pt_iommu_x86_64_hw_info pt_info;
	int ret = 0;

	/*
	 * If domain is in pt mode then setup GCR3 table only if device
	 * is PASID capable
	 */
	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
		return ret;

	/*
	 * By default, setup GCR3 table to support MAX PASIDs
	 * supported by the device/IOMMU.
	 */
	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
			       max_pasids > 0 ? max_pasids : 1);
	if (ret)
		return ret;

	/* Setup GCR3[0] only if domain is setup with v2 page table mode */
	if (!pdom_is_v2_pgtbl_mode(pdom))
		return ret;

	pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
	ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
	if (ret)
		free_gcr3_table(&dev_data->gcr3_info);

	return ret;
}

/*
 * Counterpart of init_gcr3_table(): clear GCR3[0] for v2 page table
 * domains, then free the GCR3 table if one is set up.
 */
static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
			       struct protection_domain *pdom)
{
	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;

	if (pdom_is_v2_pgtbl_mode(pdom))
		update_gcr3(dev_data, 0, 0, false);

	if (gcr3_info->gcr3_tbl == NULL)
		return;

	free_gcr3_table(gcr3_info);
}

/*
 * Take a reference on @iommu for @pdom: bump the refcount of the existing
 * iommu_array entry, or insert a new entry with refcount 1.
 *
 * Returns 0 on success, -ENOMEM when the entry cannot be allocated and
 * -ENOSPC when storing it in the array fails.
 */
static int pdom_attach_iommu(struct amd_iommu *iommu,
			     struct protection_domain *pdom)
{
	struct pdom_iommu_info *pdom_iommu_info, *curr;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&pdom->lock, flags);

	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
	if (pdom_iommu_info) {
		pdom_iommu_info->refcnt++;
		goto out_unlock;
	}

	pdom_iommu_info = kzalloc_obj(*pdom_iommu_info, GFP_ATOMIC);
	if (!pdom_iommu_info) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	pdom_iommu_info->iommu = iommu;
	pdom_iommu_info->refcnt = 1;

	/* Any non-NULL result means the new entry was not stored */
	curr = xa_cmpxchg(&pdom->iommu_array, iommu->index,
			  NULL, pdom_iommu_info, GFP_ATOMIC);
	if (curr) {
		kfree(pdom_iommu_info);
		ret = -ENOSPC;
		goto out_unlock;
	}

out_unlock:
	spin_unlock_irqrestore(&pdom->lock, flags);
	return ret;
}

/*
 * Drop the reference taken by pdom_attach_iommu(); the iommu_array entry
 * is erased and freed when its refcount reaches zero. Does nothing when
 * no entry exists for @iommu.
 */
static void pdom_detach_iommu(struct amd_iommu *iommu,
			      struct protection_domain *pdom)
{
	struct pdom_iommu_info *pdom_iommu_info;
	unsigned long flags;

	spin_lock_irqsave(&pdom->lock, flags);

	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
	if (!pdom_iommu_info) {
		spin_unlock_irqrestore(&pdom->lock, flags);
		return;
	}

	pdom_iommu_info->refcnt--;
	if (pdom_iommu_info->refcnt == 0) {
		xa_erase(&pdom->iommu_array, iommu->index);
		kfree(pdom_iommu_info);
	}

	spin_unlock_irqrestore(&pdom->lock, flags);
}

/*
 * If a device is not yet associated with a domain, this function makes the
 * device visible in the domain
 *
 * Returns 0 on success, -EBUSY when the device already has a domain, or
 * the error from pdom_attach_iommu() / init_gcr3_table().
 */
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct pci_dev *pdev;
	unsigned long flags;
	int ret = 0;

	mutex_lock(&dev_data->mutex);

	if (dev_data->domain != NULL) {
		ret = -EBUSY;
		goto out;
	}

	/* Do reference counting */
	ret = pdom_attach_iommu(iommu, domain);
	if (ret)
		goto out;

	/* Setup GCR3 table */
	if (pdom_is_sva_capable(domain)) {
		ret = init_gcr3_table(dev_data, domain);
		if (ret) {
			/* Undo the reference taken above */
			pdom_detach_iommu(iommu, domain);
			goto out;
		}
	}

	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
	if (pdev && pdom_is_sva_capable(domain)) {
		pdev_enable_caps(pdev);

		/*
		 * Device can continue to function even if IOPF
		 * enablement failed. Hence in error path just
		 * disable device PRI support.
		 */
		if (amd_iommu_iopf_add_device(iommu, dev_data))
			pdev_disable_cap_pri(pdev);
	} else if (pdev) {
		pdev_enable_cap_ats(pdev);
	}

	/* Update data structures */
	dev_data->domain = domain;
	spin_lock_irqsave(&domain->lock, flags);
	list_add(&dev_data->list, &domain->dev_list);
	spin_unlock_irqrestore(&domain->lock, flags);

	/* Update device table */
	dev_update_dte(dev_data, true);

out:
	mutex_unlock(&dev_data->mutex);

	return ret;
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
static void detach_device(struct device *dev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
	struct protection_domain *domain = dev_data->domain;
	unsigned long flags;

	mutex_lock(&dev_data->mutex);

	/*
	 * First check if the device is still attached. It might already
	 * be detached from its domain because the generic
	 * iommu_detach_group code detached it and we try again here in
	 * our alias handling.
2401 */ 2402 if (WARN_ON(!dev_data->domain)) 2403 goto out; 2404 2405 /* Remove IOPF handler */ 2406 if (dev_data->ppr) { 2407 iopf_queue_flush_dev(dev); 2408 amd_iommu_iopf_remove_device(iommu, dev_data); 2409 } 2410 2411 if (dev_is_pci(dev)) 2412 pdev_disable_caps(to_pci_dev(dev)); 2413 2414 /* Clear DTE and flush the entry */ 2415 dev_update_dte(dev_data, false); 2416 2417 /* Flush IOTLB and wait for the flushes to finish */ 2418 spin_lock_irqsave(&domain->lock, flags); 2419 amd_iommu_domain_flush_all(domain); 2420 list_del(&dev_data->list); 2421 spin_unlock_irqrestore(&domain->lock, flags); 2422 2423 /* Clear GCR3 table */ 2424 if (pdom_is_sva_capable(domain)) 2425 destroy_gcr3_table(dev_data, domain); 2426 2427 /* Update data structures */ 2428 dev_data->domain = NULL; 2429 2430 /* decrease reference counters - needs to happen after the flushes */ 2431 pdom_detach_iommu(iommu, domain); 2432 2433 out: 2434 mutex_unlock(&dev_data->mutex); 2435 } 2436 2437 static struct iommu_device *amd_iommu_probe_device(struct device *dev) 2438 { 2439 struct iommu_device *iommu_dev; 2440 struct amd_iommu *iommu; 2441 struct iommu_dev_data *dev_data; 2442 int ret; 2443 2444 if (!check_device(dev)) 2445 return ERR_PTR(-ENODEV); 2446 2447 iommu = rlookup_amd_iommu(dev); 2448 if (!iommu) 2449 return ERR_PTR(-ENODEV); 2450 2451 /* Not registered yet? */ 2452 if (!iommu->iommu.ops) 2453 return ERR_PTR(-ENODEV); 2454 2455 if (dev_iommu_priv_get(dev)) 2456 return &iommu->iommu; 2457 2458 ret = iommu_init_device(iommu, dev); 2459 if (ret) { 2460 dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); 2461 iommu_dev = ERR_PTR(ret); 2462 iommu_ignore_device(iommu, dev); 2463 goto out_err; 2464 } 2465 2466 amd_iommu_set_pci_msi_domain(dev, iommu); 2467 iommu_dev = &iommu->iommu; 2468 2469 /* 2470 * If IOMMU and device supports PASID then it will contain max 2471 * supported PASIDs, else it will be zero. 
2472 */ 2473 dev_data = dev_iommu_priv_get(dev); 2474 if (amd_iommu_pasid_supported() && dev_is_pci(dev) && 2475 pdev_pasid_supported(dev_data)) { 2476 dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids, 2477 pci_max_pasids(to_pci_dev(dev))); 2478 } 2479 2480 if (amd_iommu_pgtable == PD_MODE_NONE) { 2481 pr_warn_once("%s: DMA translation not supported by iommu.\n", 2482 __func__); 2483 iommu_dev = ERR_PTR(-ENODEV); 2484 goto out_err; 2485 } 2486 2487 iommu_completion_wait(iommu); 2488 2489 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 2490 dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; 2491 else 2492 dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; 2493 2494 if (dev_is_pci(dev)) 2495 pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); 2496 2497 out_err: 2498 return iommu_dev; 2499 } 2500 2501 static void amd_iommu_release_device(struct device *dev) 2502 { 2503 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2504 2505 WARN_ON(dev_data->domain); 2506 2507 /* 2508 * We keep dev_data around for unplugged devices and reuse it when the 2509 * device is re-plugged - not doing so would introduce a ton of races. 2510 */ 2511 } 2512 2513 static struct iommu_group *amd_iommu_device_group(struct device *dev) 2514 { 2515 if (dev_is_pci(dev)) 2516 return pci_device_group(dev); 2517 2518 return acpihid_device_group(dev); 2519 } 2520 2521 /***************************************************************************** 2522 * 2523 * The following functions belong to the exported interface of AMD IOMMU 2524 * 2525 * This interface allows access to lower level functions of the IOMMU 2526 * like protection domain handling and assignement of devices to domains 2527 * which is not possible with the dma_ops interface. 
2528 * 2529 *****************************************************************************/ 2530 2531 static void protection_domain_init(struct protection_domain *domain) 2532 { 2533 spin_lock_init(&domain->lock); 2534 INIT_LIST_HEAD(&domain->dev_list); 2535 INIT_LIST_HEAD(&domain->dev_data_list); 2536 INIT_LIST_HEAD(&domain->viommu_list); 2537 xa_init(&domain->iommu_array); 2538 } 2539 2540 struct protection_domain *protection_domain_alloc(void) 2541 { 2542 struct protection_domain *domain; 2543 int domid; 2544 2545 domain = kzalloc_obj(*domain); 2546 if (!domain) 2547 return NULL; 2548 2549 domid = amd_iommu_pdom_id_alloc(); 2550 if (domid <= 0) { 2551 kfree(domain); 2552 return NULL; 2553 } 2554 domain->id = domid; 2555 2556 protection_domain_init(domain); 2557 2558 return domain; 2559 } 2560 2561 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2562 { 2563 if (amd_iommu_hatdis) 2564 return false; 2565 2566 return iommu && (iommu->features & FEATURE_HDSUP); 2567 } 2568 2569 static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) 2570 { 2571 struct protection_domain *pdom = 2572 container_of(iommupt, struct protection_domain, iommu); 2573 2574 return &pdom->lock; 2575 } 2576 2577 /* 2578 * Update all HW references to the domain with a new pgtable configuration. 
2579 */ 2580 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 2581 phys_addr_t top_paddr, unsigned int top_level) 2582 { 2583 struct protection_domain *pdom = 2584 container_of(iommu_table, struct protection_domain, iommu); 2585 struct iommu_dev_data *dev_data; 2586 2587 lockdep_assert_held(&pdom->lock); 2588 2589 /* Update the DTE for all devices attached to this domain */ 2590 list_for_each_entry(dev_data, &pdom->dev_list, list) { 2591 struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 2592 2593 /* Update the HW references with the new level and top ptr */ 2594 set_dte_entry(iommu, dev_data, top_paddr, top_level); 2595 clone_aliases(iommu, dev_data->dev); 2596 } 2597 2598 list_for_each_entry(dev_data, &pdom->dev_list, list) 2599 device_flush_dte(dev_data); 2600 2601 domain_flush_complete(pdom); 2602 } 2603 2604 /* 2605 * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to 2606 * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non 2607 * present caching (like hypervisor shadowing). 
2608 */ 2609 static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2610 unsigned long iova, size_t size) 2611 { 2612 struct protection_domain *domain = to_pdomain(dom); 2613 unsigned long flags; 2614 2615 if (likely(!amd_iommu_np_cache)) 2616 return 0; 2617 2618 spin_lock_irqsave(&domain->lock, flags); 2619 amd_iommu_domain_flush_pages(domain, iova, iova + size - 1, 2620 CMD_INV_IOMMU_PAGES_PDE_MASK); 2621 spin_unlock_irqrestore(&domain->lock, flags); 2622 return 0; 2623 } 2624 2625 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2626 { 2627 struct protection_domain *dom = to_pdomain(domain); 2628 unsigned long flags; 2629 2630 spin_lock_irqsave(&dom->lock, flags); 2631 amd_iommu_domain_flush_all(dom); 2632 spin_unlock_irqrestore(&dom->lock, flags); 2633 } 2634 2635 static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2636 struct iommu_iotlb_gather *gather) 2637 { 2638 struct protection_domain *dom = to_pdomain(domain); 2639 unsigned long flags; 2640 2641 spin_lock_irqsave(&dom->lock, flags); 2642 amd_iommu_domain_flush_pages(dom, gather->start, gather->end, 2643 iommu_pages_list_empty(&gather->freelist) ? 
2644 0 : CMD_INV_IOMMU_PAGES_PDE_MASK); 2645 spin_unlock_irqrestore(&dom->lock, flags); 2646 iommu_put_pages_list(&gather->freelist); 2647 } 2648 2649 static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { 2650 .get_top_lock = amd_iommu_get_top_lock, 2651 .change_top = amd_iommu_change_top, 2652 }; 2653 2654 static const struct iommu_domain_ops amdv1_ops = { 2655 IOMMU_PT_DOMAIN_OPS(amdv1), 2656 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2657 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2658 .iotlb_sync = amd_iommu_iotlb_sync, 2659 .attach_dev = amd_iommu_attach_device, 2660 .free = amd_iommu_domain_free, 2661 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2662 }; 2663 2664 static const struct iommu_dirty_ops amdv1_dirty_ops = { 2665 IOMMU_PT_DIRTY_OPS(amdv1), 2666 .set_dirty_tracking = amd_iommu_set_dirty_tracking, 2667 }; 2668 2669 static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, 2670 u32 flags) 2671 { 2672 struct pt_iommu_amdv1_cfg cfg = {}; 2673 struct protection_domain *domain; 2674 int ret; 2675 2676 if (amd_iommu_hatdis) 2677 return ERR_PTR(-EOPNOTSUPP); 2678 2679 domain = protection_domain_alloc(); 2680 if (!domain) 2681 return ERR_PTR(-ENOMEM); 2682 2683 domain->pd_mode = PD_MODE_V1; 2684 domain->iommu.driver_ops = &amd_hw_driver_ops_v1; 2685 domain->iommu.nid = dev_to_node(dev); 2686 if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 2687 domain->domain.dirty_ops = &amdv1_dirty_ops; 2688 2689 /* 2690 * Someday FORCE_COHERENCE should be set by 2691 * amd_iommu_enforce_cache_coherency() like VT-d does. 2692 */ 2693 cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 2694 BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 2695 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 2696 2697 /* 2698 * AMD's IOMMU can flush as many pages as necessary in a single flush. 
2699 * Unless we run in a virtual machine, which can be inferred according 2700 * to whether "non-present cache" is on, it is probably best to prefer 2701 * (potentially) too extensive TLB flushing (i.e., more misses) over 2702 * multiple TLB flushes (i.e., more flushes). For virtual machines the 2703 * hypervisor needs to synchronize the host IOMMU PTEs with those of 2704 * the guest, and the trade-off is different: unnecessary TLB flushes 2705 * should be avoided. 2706 */ 2707 if (amd_iommu_np_cache) 2708 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2709 else 2710 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2711 2712 cfg.common.hw_max_vasz_lg2 = amd_iommu_hpt_vasize; 2713 cfg.common.hw_max_oasz_lg2 = 52; 2714 cfg.starting_level = 2; 2715 domain->domain.ops = &amdv1_ops; 2716 2717 ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); 2718 if (ret) { 2719 amd_iommu_domain_free(&domain->domain); 2720 return ERR_PTR(ret); 2721 } 2722 2723 /* 2724 * Narrow the supported page sizes to those selected by the kernel 2725 * command line. 2726 */ 2727 domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; 2728 return &domain->domain; 2729 } 2730 2731 static const struct iommu_domain_ops amdv2_ops = { 2732 IOMMU_PT_DOMAIN_OPS(x86_64), 2733 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2734 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2735 .iotlb_sync = amd_iommu_iotlb_sync, 2736 .attach_dev = amd_iommu_attach_device, 2737 .free = amd_iommu_domain_free, 2738 /* 2739 * Note the AMDv2 page table format does not support a Force Coherency 2740 * bit, so enforce_cache_coherency should not be set. However VFIO is 2741 * not prepared to handle a case where some domains will support 2742 * enforcement and others do not. VFIO and iommufd will have to be fixed 2743 * before it can fully use the V2 page table. See the comment in 2744 * iommufd_hwpt_paging_alloc(). For now leave things as they have 2745 * historically been and lie about enforce_cache_coherencey. 
2746 */ 2747 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2748 }; 2749 2750 static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, 2751 u32 flags) 2752 { 2753 struct pt_iommu_x86_64_cfg cfg = {}; 2754 struct protection_domain *domain; 2755 int ret; 2756 2757 if (!amd_iommu_v2_pgtbl_supported()) 2758 return ERR_PTR(-EOPNOTSUPP); 2759 2760 domain = protection_domain_alloc(); 2761 if (!domain) 2762 return ERR_PTR(-ENOMEM); 2763 2764 domain->pd_mode = PD_MODE_V2; 2765 domain->iommu.nid = dev_to_node(dev); 2766 2767 cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); 2768 if (amd_iommu_np_cache) 2769 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2770 else 2771 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2772 2773 /* 2774 * The v2 table behaves differently if it is attached to PASID 0 vs a 2775 * non-zero PASID. On PASID 0 it has no sign extension and the full 2776 * 57/48 bits decode the lower addresses. Otherwise it behaves like a 2777 * normal sign extended x86 page table. Since we want the domain to work 2778 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not 2779 * set which creates a table that is compatible in both modes. 
2780 */ 2781 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { 2782 cfg.common.hw_max_vasz_lg2 = 56; 2783 cfg.top_level = 4; 2784 } else { 2785 cfg.common.hw_max_vasz_lg2 = 47; 2786 cfg.top_level = 3; 2787 } 2788 cfg.common.hw_max_oasz_lg2 = 52; 2789 domain->domain.ops = &amdv2_ops; 2790 2791 ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); 2792 if (ret) { 2793 amd_iommu_domain_free(&domain->domain); 2794 return ERR_PTR(ret); 2795 } 2796 return &domain->domain; 2797 } 2798 2799 static inline bool is_nest_parent_supported(u32 flags) 2800 { 2801 /* Only allow nest parent when these features are supported */ 2802 return check_feature(FEATURE_GT) && 2803 check_feature(FEATURE_GIOSUP) && 2804 check_feature2(FEATURE_GCR3TRPMODE); 2805 } 2806 2807 static struct iommu_domain * 2808 amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 2809 const struct iommu_user_data *user_data) 2810 2811 { 2812 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2813 const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 2814 IOMMU_HWPT_ALLOC_PASID | 2815 IOMMU_HWPT_ALLOC_NEST_PARENT; 2816 2817 if ((flags & ~supported_flags) || user_data) 2818 return ERR_PTR(-EOPNOTSUPP); 2819 2820 switch (flags & supported_flags) { 2821 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: 2822 case IOMMU_HWPT_ALLOC_NEST_PARENT: 2823 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: 2824 /* 2825 * Allocate domain with v1 page table for dirty tracking 2826 * and/or Nest parent. 2827 */ 2828 if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && 2829 !amd_iommu_hd_support(iommu)) 2830 break; 2831 2832 if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && 2833 !is_nest_parent_supported(flags)) 2834 break; 2835 2836 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2837 case IOMMU_HWPT_ALLOC_PASID: 2838 /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ 2839 if (!amd_iommu_pasid_supported()) 2840 break; 2841 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2842 case 0: { 2843 struct iommu_domain *ret; 2844 2845 /* If nothing specific is required use the kernel commandline default */ 2846 if (amd_iommu_pgtable == PD_MODE_V1) { 2847 ret = amd_iommu_domain_alloc_paging_v1(dev, flags); 2848 if (ret != ERR_PTR(-EOPNOTSUPP)) 2849 return ret; 2850 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2851 } 2852 ret = amd_iommu_domain_alloc_paging_v2(dev, flags); 2853 if (ret != ERR_PTR(-EOPNOTSUPP)) 2854 return ret; 2855 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2856 } 2857 default: 2858 break; 2859 } 2860 return ERR_PTR(-EOPNOTSUPP); 2861 } 2862 2863 void amd_iommu_domain_free(struct iommu_domain *dom) 2864 { 2865 struct protection_domain *domain = to_pdomain(dom); 2866 2867 WARN_ON(!list_empty(&domain->dev_list)); 2868 pt_iommu_deinit(&domain->iommu); 2869 amd_iommu_pdom_id_free(domain->id); 2870 kfree(domain); 2871 } 2872 2873 static int blocked_domain_attach_device(struct iommu_domain *domain, 2874 struct device *dev, 2875 struct iommu_domain *old) 2876 { 2877 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2878 2879 if (dev_data->domain) 2880 detach_device(dev); 2881 2882 /* Clear DTE and flush the entry */ 2883 mutex_lock(&dev_data->mutex); 2884 dev_update_dte(dev_data, false); 2885 mutex_unlock(&dev_data->mutex); 2886 2887 return 0; 2888 } 2889 2890 static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, 2891 struct device *dev, ioasid_t pasid, 2892 struct iommu_domain *old) 2893 { 2894 amd_iommu_remove_dev_pasid(dev, pasid, old); 2895 return 0; 2896 } 2897 2898 static struct iommu_domain blocked_domain = { 2899 .type = IOMMU_DOMAIN_BLOCKED, 2900 .ops = &(const struct iommu_domain_ops) { 2901 .attach_dev = blocked_domain_attach_device, 2902 .set_dev_pasid = blocked_domain_set_dev_pasid, 2903 } 2904 }; 2905 2906 static struct protection_domain identity_domain; 2907 2908 
static int amd_iommu_identity_attach(struct iommu_domain *dom, struct device *dev, 2909 struct iommu_domain *old) 2910 { 2911 /* 2912 * Don't allow attaching a device to the identity domain if SNP is 2913 * enabled. 2914 */ 2915 if (amd_iommu_snp_en) 2916 return -EINVAL; 2917 2918 return amd_iommu_attach_device(dom, dev, old); 2919 } 2920 2921 static const struct iommu_domain_ops identity_domain_ops = { 2922 .attach_dev = amd_iommu_identity_attach, 2923 }; 2924 2925 void amd_iommu_init_identity_domain(void) 2926 { 2927 struct iommu_domain *domain = &identity_domain.domain; 2928 2929 domain->type = IOMMU_DOMAIN_IDENTITY; 2930 domain->ops = &identity_domain_ops; 2931 domain->owner = &amd_iommu_ops; 2932 2933 identity_domain.id = amd_iommu_pdom_id_alloc(); 2934 2935 protection_domain_init(&identity_domain); 2936 } 2937 2938 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 2939 struct iommu_domain *old) 2940 { 2941 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2942 struct protection_domain *domain = to_pdomain(dom); 2943 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2944 int ret; 2945 2946 /* 2947 * Skip attach device to domain if new domain is same as 2948 * devices current domain 2949 */ 2950 if (dev_data->domain == domain) 2951 return 0; 2952 2953 dev_data->defer_attach = false; 2954 2955 /* 2956 * Restrict to devices with compatible IOMMU hardware support 2957 * when enforcement of dirty tracking is enabled. 
2958 */ 2959 if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) 2960 return -EINVAL; 2961 2962 if (dev_data->domain) 2963 detach_device(dev); 2964 2965 ret = attach_device(dev, domain); 2966 2967 #ifdef CONFIG_IRQ_REMAP 2968 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { 2969 if (dom->type == IOMMU_DOMAIN_UNMANAGED) 2970 dev_data->use_vapic = 1; 2971 else 2972 dev_data->use_vapic = 0; 2973 } 2974 #endif 2975 2976 return ret; 2977 } 2978 2979 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) 2980 { 2981 switch (cap) { 2982 case IOMMU_CAP_CACHE_COHERENCY: 2983 return true; 2984 case IOMMU_CAP_NOEXEC: 2985 return false; 2986 case IOMMU_CAP_PRE_BOOT_PROTECTION: 2987 return amdr_ivrs_remap_support; 2988 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 2989 return true; 2990 case IOMMU_CAP_DIRTY_TRACKING: { 2991 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2992 2993 return amd_iommu_hd_support(iommu); 2994 } 2995 case IOMMU_CAP_PCI_ATS_SUPPORTED: { 2996 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2997 2998 return amd_iommu_iotlb_sup && 2999 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); 3000 } 3001 default: 3002 break; 3003 } 3004 3005 return false; 3006 } 3007 3008 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 3009 bool enable) 3010 { 3011 struct protection_domain *pdomain = to_pdomain(domain); 3012 struct dev_table_entry *dte; 3013 struct iommu_dev_data *dev_data; 3014 bool domain_flush = false; 3015 struct amd_iommu *iommu; 3016 unsigned long flags; 3017 u64 new; 3018 3019 spin_lock_irqsave(&pdomain->lock, flags); 3020 if (!(pdomain->dirty_tracking ^ enable)) { 3021 spin_unlock_irqrestore(&pdomain->lock, flags); 3022 return 0; 3023 } 3024 3025 list_for_each_entry(dev_data, &pdomain->dev_list, list) { 3026 spin_lock(&dev_data->dte_lock); 3027 iommu = get_amd_iommu_from_dev_data(dev_data); 3028 dte = &get_dev_table(iommu)[dev_data->devid]; 3029 new = dte->data[0]; 3030 new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); 3031 dte->data[0] = new; 3032 spin_unlock(&dev_data->dte_lock); 3033 3034 /* Flush device DTE */ 3035 device_flush_dte(dev_data); 3036 domain_flush = true; 3037 } 3038 3039 /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ 3040 if (domain_flush) 3041 amd_iommu_domain_flush_all(pdomain); 3042 3043 pdomain->dirty_tracking = enable; 3044 spin_unlock_irqrestore(&pdomain->lock, flags); 3045 3046 return 0; 3047 } 3048 3049 static void amd_iommu_get_resv_regions(struct device *dev, 3050 struct list_head *head) 3051 { 3052 struct iommu_resv_region *region; 3053 struct unity_map_entry *entry; 3054 struct amd_iommu *iommu; 3055 struct amd_iommu_pci_seg *pci_seg; 3056 int devid, sbdf; 3057 3058 sbdf = get_device_sbdf_id(dev); 3059 if (sbdf < 0) 3060 return; 3061 3062 devid = PCI_SBDF_TO_DEVID(sbdf); 3063 iommu = get_amd_iommu_from_dev(dev); 3064 pci_seg = iommu->pci_seg; 3065 3066 list_for_each_entry(entry, &pci_seg->unity_map, list) { 3067 int type, prot = 0; 3068 size_t length; 3069 3070 if (devid < entry->devid_start || devid > entry->devid_end) 3071 continue; 3072 3073 type = IOMMU_RESV_DIRECT; 3074 length = entry->address_end - entry->address_start; 3075 if (entry->prot & IOMMU_PROT_IR) 3076 prot |= IOMMU_READ; 3077 if (entry->prot & IOMMU_PROT_IW) 3078 prot |= IOMMU_WRITE; 3079 3080 region = iommu_alloc_resv_region(entry->address_start, 3081 length, prot, type, 3082 GFP_KERNEL); 3083 if (!region) { 3084 dev_err(dev, "Out of memory allocating dm-regions\n"); 3085 return; 3086 } 3087 list_add_tail(®ion->list, head); 3088 } 3089 3090 region = iommu_alloc_resv_region(MSI_RANGE_START, 3091 MSI_RANGE_END - MSI_RANGE_START + 1, 3092 0, IOMMU_RESV_MSI, GFP_KERNEL); 3093 if (!region) 3094 return; 3095 list_add_tail(®ion->list, head); 3096 3097 if (amd_iommu_ht_range_ignore()) 3098 return; 3099 3100 region = iommu_alloc_resv_region(HT_RANGE_START, 3101 HT_RANGE_END - HT_RANGE_START + 1, 3102 0, IOMMU_RESV_RESERVED, 
GFP_KERNEL); 3103 if (!region) 3104 return; 3105 list_add_tail(®ion->list, head); 3106 } 3107 3108 static bool amd_iommu_is_attach_deferred(struct device *dev) 3109 { 3110 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 3111 3112 return dev_data->defer_attach; 3113 } 3114 3115 static int amd_iommu_def_domain_type(struct device *dev) 3116 { 3117 struct iommu_dev_data *dev_data; 3118 3119 dev_data = dev_iommu_priv_get(dev); 3120 if (!dev_data) 3121 return 0; 3122 3123 /* Always use DMA domain for untrusted device */ 3124 if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) 3125 return IOMMU_DOMAIN_DMA; 3126 3127 /* 3128 * Do not identity map IOMMUv2 capable devices when: 3129 * - memory encryption is active, because some of those devices 3130 * (AMD GPUs) don't have the encryption bit in their DMA-mask 3131 * and require remapping. 3132 * - SNP is enabled, because it prohibits DTE[Mode]=0. 3133 */ 3134 if (pdev_pasid_supported(dev_data) && 3135 !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && 3136 !amd_iommu_snp_en) { 3137 return IOMMU_DOMAIN_IDENTITY; 3138 } 3139 3140 return 0; 3141 } 3142 3143 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3144 { 3145 /* IOMMU_PTE_FC is always set */ 3146 return true; 3147 } 3148 3149 const struct iommu_ops amd_iommu_ops = { 3150 .capable = amd_iommu_capable, 3151 .hw_info = amd_iommufd_hw_info, 3152 .blocked_domain = &blocked_domain, 3153 .release_domain = &blocked_domain, 3154 .identity_domain = &identity_domain.domain, 3155 .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, 3156 .domain_alloc_sva = amd_iommu_domain_alloc_sva, 3157 .probe_device = amd_iommu_probe_device, 3158 .release_device = amd_iommu_release_device, 3159 .device_group = amd_iommu_device_group, 3160 .get_resv_regions = amd_iommu_get_resv_regions, 3161 .is_attach_deferred = amd_iommu_is_attach_deferred, 3162 .def_domain_type = amd_iommu_def_domain_type, 3163 .page_response = amd_iommu_page_response, 3164 
.get_viommu_size = amd_iommufd_get_viommu_size, 3165 .viommu_init = amd_iommufd_viommu_init, 3166 }; 3167 3168 #ifdef CONFIG_IRQ_REMAP 3169 3170 /***************************************************************************** 3171 * 3172 * Interrupt Remapping Implementation 3173 * 3174 *****************************************************************************/ 3175 3176 static struct irq_chip amd_ir_chip; 3177 static DEFINE_SPINLOCK(iommu_table_lock); 3178 3179 static int iommu_flush_dev_irt(struct pci_dev *unused, u16 devid, void *data) 3180 { 3181 int ret; 3182 struct iommu_cmd cmd; 3183 struct amd_iommu *iommu = data; 3184 3185 build_inv_irt(&cmd, devid); 3186 ret = __iommu_queue_command_sync(iommu, &cmd, true); 3187 return ret; 3188 } 3189 3190 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) 3191 { 3192 int ret; 3193 u64 data; 3194 unsigned long flags; 3195 struct iommu_cmd cmd; 3196 struct pci_dev *pdev = NULL; 3197 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3198 3199 if (iommu->irtcachedis_enabled) 3200 return; 3201 3202 if (dev_data && dev_data->dev && dev_is_pci(dev_data->dev)) 3203 pdev = to_pci_dev(dev_data->dev); 3204 3205 raw_spin_lock_irqsave(&iommu->lock, flags); 3206 data = get_cmdsem_val(iommu); 3207 build_completion_wait(&cmd, iommu, data); 3208 3209 if (pdev) 3210 ret = pci_for_each_dma_alias(pdev, iommu_flush_dev_irt, iommu); 3211 else 3212 ret = iommu_flush_dev_irt(NULL, devid, iommu); 3213 if (ret) 3214 goto out_err; 3215 3216 ret = __iommu_queue_command_sync(iommu, &cmd, false); 3217 if (ret) 3218 goto out_err; 3219 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3220 3221 wait_on_sem(iommu, data); 3222 return; 3223 3224 out_err: 3225 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3226 } 3227 3228 static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) 3229 { 3230 if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) 3231 return DTE_INTTABLEN_2K; 3232 return 
DTE_INTTABLEN_512; 3233 } 3234 3235 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, 3236 struct irq_remap_table *table) 3237 { 3238 u64 new; 3239 struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; 3240 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3241 3242 if (dev_data) 3243 spin_lock(&dev_data->dte_lock); 3244 3245 new = READ_ONCE(dte->data[2]); 3246 new &= ~DTE_IRQ_PHYS_ADDR_MASK; 3247 new |= iommu_virt_to_phys(table->table); 3248 new |= DTE_IRQ_REMAP_INTCTL; 3249 new |= iommu_get_int_tablen(dev_data); 3250 new |= DTE_IRQ_REMAP_ENABLE; 3251 WRITE_ONCE(dte->data[2], new); 3252 3253 if (dev_data) 3254 spin_unlock(&dev_data->dte_lock); 3255 } 3256 3257 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) 3258 { 3259 struct irq_remap_table *table; 3260 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3261 3262 if (WARN_ONCE(!pci_seg->rlookup_table[devid], 3263 "%s: no iommu for devid %x:%x\n", 3264 __func__, pci_seg->id, devid)) 3265 return NULL; 3266 3267 table = pci_seg->irq_lookup_table[devid]; 3268 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n", 3269 __func__, pci_seg->id, devid)) 3270 return NULL; 3271 3272 return table; 3273 } 3274 3275 static struct irq_remap_table *__alloc_irq_table(int nid, size_t size) 3276 { 3277 struct irq_remap_table *table; 3278 3279 table = kzalloc_obj(*table); 3280 if (!table) 3281 return NULL; 3282 3283 table->table = iommu_alloc_pages_node_sz( 3284 nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size)); 3285 if (!table->table) { 3286 kfree(table); 3287 return NULL; 3288 } 3289 raw_spin_lock_init(&table->lock); 3290 3291 return table; 3292 } 3293 3294 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, 3295 struct irq_remap_table *table) 3296 { 3297 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3298 3299 pci_seg->irq_lookup_table[devid] = table; 3300 set_dte_irq_entry(iommu, devid, table); 3301 iommu_flush_dte(iommu, devid); 3302 
} 3303 3304 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, 3305 void *data) 3306 { 3307 struct irq_remap_table *table = data; 3308 struct amd_iommu_pci_seg *pci_seg; 3309 struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev); 3310 3311 if (!iommu) 3312 return -EINVAL; 3313 3314 pci_seg = iommu->pci_seg; 3315 pci_seg->irq_lookup_table[alias] = table; 3316 set_dte_irq_entry(iommu, alias, table); 3317 iommu_flush_dte(pci_seg->rlookup_table[alias], alias); 3318 3319 return 0; 3320 } 3321 3322 static inline size_t get_irq_table_size(unsigned int max_irqs) 3323 { 3324 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3325 return max_irqs * sizeof(u32); 3326 3327 return max_irqs * (sizeof(u64) * 2); 3328 } 3329 3330 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, 3331 u16 devid, struct pci_dev *pdev, 3332 unsigned int max_irqs) 3333 { 3334 struct irq_remap_table *table = NULL; 3335 struct irq_remap_table *new_table = NULL; 3336 struct amd_iommu_pci_seg *pci_seg; 3337 unsigned long flags; 3338 int nid = iommu->dev ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3339 u16 alias; 3340 3341 spin_lock_irqsave(&iommu_table_lock, flags); 3342 3343 pci_seg = iommu->pci_seg; 3344 table = pci_seg->irq_lookup_table[devid]; 3345 if (table) 3346 goto out_unlock; 3347 3348 alias = pci_seg->alias_table[devid]; 3349 table = pci_seg->irq_lookup_table[alias]; 3350 if (table) { 3351 set_remap_table_entry(iommu, devid, table); 3352 goto out_wait; 3353 } 3354 spin_unlock_irqrestore(&iommu_table_lock, flags); 3355 3356 /* Nothing there yet, allocate new irq remapping table */ 3357 new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs)); 3358 if (!new_table) 3359 return NULL; 3360 3361 spin_lock_irqsave(&iommu_table_lock, flags); 3362 3363 table = pci_seg->irq_lookup_table[devid]; 3364 if (table) 3365 goto out_unlock; 3366 3367 table = pci_seg->irq_lookup_table[alias]; 3368 if (table) { 3369 set_remap_table_entry(iommu, devid, table); 3370 goto out_wait; 3371 } 3372 3373 table = new_table; 3374 new_table = NULL; 3375 3376 if (pdev) 3377 pci_for_each_dma_alias(pdev, set_remap_table_entry_alias, 3378 table); 3379 else 3380 set_remap_table_entry(iommu, devid, table); 3381 3382 if (devid != alias) 3383 set_remap_table_entry(iommu, alias, table); 3384 3385 out_wait: 3386 iommu_completion_wait(iommu); 3387 3388 out_unlock: 3389 spin_unlock_irqrestore(&iommu_table_lock, flags); 3390 3391 if (new_table) { 3392 iommu_free_pages(new_table->table); 3393 kfree(new_table); 3394 } 3395 return table; 3396 } 3397 3398 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, 3399 bool align, struct pci_dev *pdev, 3400 unsigned long max_irqs) 3401 { 3402 struct irq_remap_table *table; 3403 int index, c, alignment = 1; 3404 unsigned long flags; 3405 3406 table = alloc_irq_table(iommu, devid, pdev, max_irqs); 3407 if (!table) 3408 return -ENODEV; 3409 3410 if (align) 3411 alignment = roundup_pow_of_two(count); 3412 3413 raw_spin_lock_irqsave(&table->lock, flags); 3414 3415 /* Scan table for 
free entries */ 3416 for (index = ALIGN(table->min_index, alignment), c = 0; 3417 index < max_irqs;) { 3418 if (!iommu->irte_ops->is_allocated(table, index)) { 3419 c += 1; 3420 } else { 3421 c = 0; 3422 index = ALIGN(index + 1, alignment); 3423 continue; 3424 } 3425 3426 if (c == count) { 3427 for (; c != 0; --c) 3428 iommu->irte_ops->set_allocated(table, index - c + 1); 3429 3430 index -= count - 1; 3431 goto out; 3432 } 3433 3434 index++; 3435 } 3436 3437 index = -ENOSPC; 3438 3439 out: 3440 raw_spin_unlock_irqrestore(&table->lock, flags); 3441 3442 return index; 3443 } 3444 3445 static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3446 struct irte_ga *irte) 3447 { 3448 struct irq_remap_table *table; 3449 struct irte_ga *entry; 3450 unsigned long flags; 3451 u128 old; 3452 3453 table = get_irq_table(iommu, devid); 3454 if (!table) 3455 return -ENOMEM; 3456 3457 raw_spin_lock_irqsave(&table->lock, flags); 3458 3459 entry = (struct irte_ga *)table->table; 3460 entry = &entry[index]; 3461 3462 /* 3463 * We use cmpxchg16 to atomically update the 128-bit IRTE, 3464 * and it cannot be updated by the hardware or other processors 3465 * behind us, so the return value of cmpxchg16 should be the 3466 * same as the old value. 
3467 */ 3468 old = entry->irte; 3469 WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte)); 3470 3471 raw_spin_unlock_irqrestore(&table->lock, flags); 3472 3473 return 0; 3474 } 3475 3476 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3477 struct irte_ga *irte) 3478 { 3479 int ret; 3480 3481 ret = __modify_irte_ga(iommu, devid, index, irte); 3482 if (ret) 3483 return ret; 3484 3485 iommu_flush_irt_and_complete(iommu, devid); 3486 3487 return 0; 3488 } 3489 3490 static int modify_irte(struct amd_iommu *iommu, 3491 u16 devid, int index, union irte *irte) 3492 { 3493 struct irq_remap_table *table; 3494 unsigned long flags; 3495 3496 table = get_irq_table(iommu, devid); 3497 if (!table) 3498 return -ENOMEM; 3499 3500 raw_spin_lock_irqsave(&table->lock, flags); 3501 table->table[index] = irte->val; 3502 raw_spin_unlock_irqrestore(&table->lock, flags); 3503 3504 iommu_flush_irt_and_complete(iommu, devid); 3505 3506 return 0; 3507 } 3508 3509 static void free_irte(struct amd_iommu *iommu, u16 devid, int index) 3510 { 3511 struct irq_remap_table *table; 3512 unsigned long flags; 3513 3514 table = get_irq_table(iommu, devid); 3515 if (!table) 3516 return; 3517 3518 raw_spin_lock_irqsave(&table->lock, flags); 3519 iommu->irte_ops->clear_allocated(table, index); 3520 raw_spin_unlock_irqrestore(&table->lock, flags); 3521 3522 iommu_flush_irt_and_complete(iommu, devid); 3523 } 3524 3525 static void irte_prepare(void *entry, 3526 u32 delivery_mode, bool dest_mode, 3527 u8 vector, u32 dest_apicid, int devid) 3528 { 3529 union irte *irte = (union irte *) entry; 3530 3531 irte->val = 0; 3532 irte->fields.vector = vector; 3533 irte->fields.int_type = delivery_mode; 3534 irte->fields.destination = dest_apicid; 3535 irte->fields.dm = dest_mode; 3536 irte->fields.valid = 1; 3537 } 3538 3539 static void irte_ga_prepare(void *entry, 3540 u32 delivery_mode, bool dest_mode, 3541 u8 vector, u32 dest_apicid, int devid) 3542 { 3543 struct irte_ga *irte = (struct 
irte_ga *) entry; 3544 3545 irte->lo.val = 0; 3546 irte->hi.val = 0; 3547 irte->lo.fields_remap.int_type = delivery_mode; 3548 irte->lo.fields_remap.dm = dest_mode; 3549 irte->hi.fields.vector = vector; 3550 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid); 3551 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid); 3552 irte->lo.fields_remap.valid = 1; 3553 } 3554 3555 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3556 { 3557 union irte *irte = (union irte *) entry; 3558 3559 irte->fields.valid = 1; 3560 modify_irte(iommu, devid, index, irte); 3561 } 3562 3563 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3564 { 3565 struct irte_ga *irte = (struct irte_ga *) entry; 3566 3567 irte->lo.fields_remap.valid = 1; 3568 modify_irte_ga(iommu, devid, index, irte); 3569 } 3570 3571 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3572 { 3573 union irte *irte = (union irte *) entry; 3574 3575 irte->fields.valid = 0; 3576 modify_irte(iommu, devid, index, irte); 3577 } 3578 3579 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3580 { 3581 struct irte_ga *irte = (struct irte_ga *) entry; 3582 3583 irte->lo.fields_remap.valid = 0; 3584 modify_irte_ga(iommu, devid, index, irte); 3585 } 3586 3587 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3588 u8 vector, u32 dest_apicid) 3589 { 3590 union irte *irte = (union irte *) entry; 3591 3592 irte->fields.vector = vector; 3593 irte->fields.destination = dest_apicid; 3594 modify_irte(iommu, devid, index, irte); 3595 } 3596 3597 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3598 u8 vector, u32 dest_apicid) 3599 { 3600 struct irte_ga *irte = (struct irte_ga *) entry; 3601 3602 if (!irte->lo.fields_remap.guest_mode) { 3603 irte->hi.fields.vector = vector; 
3604 irte->lo.fields_remap.destination = 3605 APICID_TO_IRTE_DEST_LO(dest_apicid); 3606 irte->hi.fields.destination = 3607 APICID_TO_IRTE_DEST_HI(dest_apicid); 3608 modify_irte_ga(iommu, devid, index, irte); 3609 } 3610 } 3611 3612 #define IRTE_ALLOCATED (~1U) 3613 static void irte_set_allocated(struct irq_remap_table *table, int index) 3614 { 3615 table->table[index] = IRTE_ALLOCATED; 3616 } 3617 3618 static void irte_ga_set_allocated(struct irq_remap_table *table, int index) 3619 { 3620 struct irte_ga *ptr = (struct irte_ga *)table->table; 3621 struct irte_ga *irte = &ptr[index]; 3622 3623 memset(&irte->lo.val, 0, sizeof(u64)); 3624 memset(&irte->hi.val, 0, sizeof(u64)); 3625 irte->hi.fields.vector = 0xff; 3626 } 3627 3628 static bool irte_is_allocated(struct irq_remap_table *table, int index) 3629 { 3630 union irte *ptr = (union irte *)table->table; 3631 union irte *irte = &ptr[index]; 3632 3633 return irte->val != 0; 3634 } 3635 3636 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) 3637 { 3638 struct irte_ga *ptr = (struct irte_ga *)table->table; 3639 struct irte_ga *irte = &ptr[index]; 3640 3641 return irte->hi.fields.vector != 0; 3642 } 3643 3644 static void irte_clear_allocated(struct irq_remap_table *table, int index) 3645 { 3646 table->table[index] = 0; 3647 } 3648 3649 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) 3650 { 3651 struct irte_ga *ptr = (struct irte_ga *)table->table; 3652 struct irte_ga *irte = &ptr[index]; 3653 3654 memset(&irte->lo.val, 0, sizeof(u64)); 3655 memset(&irte->hi.val, 0, sizeof(u64)); 3656 } 3657 3658 static int get_devid(struct irq_alloc_info *info) 3659 { 3660 switch (info->type) { 3661 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3662 return get_ioapic_devid(info->devid); 3663 case X86_IRQ_ALLOC_TYPE_HPET: 3664 return get_hpet_devid(info->devid); 3665 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3666 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3667 return get_device_sbdf_id(msi_desc_to_dev(info->desc)); 
3668 default: 3669 WARN_ON_ONCE(1); 3670 return -1; 3671 } 3672 } 3673 3674 struct irq_remap_ops amd_iommu_irq_ops = { 3675 .prepare = amd_iommu_prepare, 3676 .enable = amd_iommu_enable, 3677 .disable = amd_iommu_disable, 3678 .reenable = amd_iommu_reenable, 3679 .enable_faulting = amd_iommu_enable_faulting, 3680 }; 3681 3682 static void fill_msi_msg(struct msi_msg *msg, u32 index) 3683 { 3684 msg->data = index; 3685 msg->address_lo = 0; 3686 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; 3687 /* 3688 * The struct msi_msg.dest_mode_logical is used to set the DM bit 3689 * in MSI Message Address Register. For device w/ 2K int-remap support, 3690 * this is bit must be set to 1 regardless of the actual destination 3691 * mode, which is signified by the IRTE[DM]. 3692 */ 3693 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 3694 msg->arch_addr_lo.dest_mode_logical = true; 3695 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; 3696 } 3697 3698 static void irq_remapping_prepare_irte(struct amd_ir_data *data, 3699 struct irq_cfg *irq_cfg, 3700 struct irq_alloc_info *info, 3701 int devid, int index, int sub_handle) 3702 { 3703 struct irq_2_irte *irte_info = &data->irq_2_irte; 3704 struct amd_iommu *iommu = data->iommu; 3705 3706 if (!iommu) 3707 return; 3708 3709 data->irq_2_irte.devid = devid; 3710 data->irq_2_irte.index = index + sub_handle; 3711 iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED, 3712 apic->dest_mode_logical, irq_cfg->vector, 3713 irq_cfg->dest_apicid, devid); 3714 3715 switch (info->type) { 3716 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3717 case X86_IRQ_ALLOC_TYPE_HPET: 3718 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3719 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3720 fill_msi_msg(&data->msi_entry, irte_info->index); 3721 break; 3722 3723 default: 3724 BUG_ON(1); 3725 break; 3726 } 3727 } 3728 3729 struct amd_irte_ops irte_32_ops = { 3730 .prepare = irte_prepare, 3731 .activate = irte_activate, 3732 .deactivate = irte_deactivate, 3733 .set_affinity = 
irte_set_affinity, 3734 .set_allocated = irte_set_allocated, 3735 .is_allocated = irte_is_allocated, 3736 .clear_allocated = irte_clear_allocated, 3737 }; 3738 3739 struct amd_irte_ops irte_128_ops = { 3740 .prepare = irte_ga_prepare, 3741 .activate = irte_ga_activate, 3742 .deactivate = irte_ga_deactivate, 3743 .set_affinity = irte_ga_set_affinity, 3744 .set_allocated = irte_ga_set_allocated, 3745 .is_allocated = irte_ga_is_allocated, 3746 .clear_allocated = irte_ga_clear_allocated, 3747 }; 3748 3749 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, 3750 unsigned int nr_irqs, void *arg) 3751 { 3752 struct irq_alloc_info *info = arg; 3753 struct irq_data *irq_data; 3754 struct amd_ir_data *data = NULL; 3755 struct amd_iommu *iommu; 3756 struct irq_cfg *cfg; 3757 struct iommu_dev_data *dev_data; 3758 unsigned long max_irqs; 3759 int i, ret, devid, seg, sbdf; 3760 int index; 3761 3762 if (!info) 3763 return -EINVAL; 3764 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) 3765 return -EINVAL; 3766 3767 sbdf = get_devid(info); 3768 if (sbdf < 0) 3769 return -EINVAL; 3770 3771 seg = PCI_SBDF_TO_SEGID(sbdf); 3772 devid = PCI_SBDF_TO_DEVID(sbdf); 3773 iommu = __rlookup_amd_iommu(seg, devid); 3774 if (!iommu) 3775 return -EINVAL; 3776 3777 dev_data = search_dev_data(iommu, devid); 3778 max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; 3779 3780 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); 3781 if (ret < 0) 3782 return ret; 3783 3784 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { 3785 struct irq_remap_table *table; 3786 3787 table = alloc_irq_table(iommu, devid, NULL, max_irqs); 3788 if (table) { 3789 if (!table->min_index) { 3790 /* 3791 * Keep the first 32 indexes free for IOAPIC 3792 * interrupts. 
3793 */ 3794 table->min_index = 32; 3795 for (i = 0; i < 32; ++i) 3796 iommu->irte_ops->set_allocated(table, i); 3797 } 3798 WARN_ON(table->min_index != 32); 3799 index = info->ioapic.pin; 3800 } else { 3801 index = -ENOMEM; 3802 } 3803 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || 3804 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { 3805 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); 3806 3807 index = alloc_irq_index(iommu, devid, nr_irqs, align, 3808 msi_desc_to_pci_dev(info->desc), 3809 max_irqs); 3810 } else { 3811 index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, 3812 max_irqs); 3813 } 3814 3815 if (index < 0) { 3816 pr_warn("Failed to allocate IRTE\n"); 3817 ret = index; 3818 goto out_free_parent; 3819 } 3820 3821 for (i = 0; i < nr_irqs; i++) { 3822 irq_data = irq_domain_get_irq_data(domain, virq + i); 3823 cfg = irq_data ? irqd_cfg(irq_data) : NULL; 3824 if (!cfg) { 3825 ret = -EINVAL; 3826 goto out_free_data; 3827 } 3828 3829 ret = -ENOMEM; 3830 data = kzalloc_obj(*data); 3831 if (!data) 3832 goto out_free_data; 3833 3834 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3835 data->entry = kzalloc_obj(union irte); 3836 else 3837 data->entry = kzalloc_obj(struct irte_ga); 3838 if (!data->entry) { 3839 kfree(data); 3840 goto out_free_data; 3841 } 3842 3843 data->iommu = iommu; 3844 irq_data->hwirq = (devid << 16) + i; 3845 irq_data->chip_data = data; 3846 irq_data->chip = &amd_ir_chip; 3847 irq_remapping_prepare_irte(data, cfg, info, devid, index, i); 3848 } 3849 3850 return 0; 3851 3852 out_free_data: 3853 for (i--; i >= 0; i--) { 3854 irq_data = irq_domain_get_irq_data(domain, virq + i); 3855 if (irq_data) 3856 kfree(irq_data->chip_data); 3857 } 3858 for (i = 0; i < nr_irqs; i++) 3859 free_irte(iommu, devid, index + i); 3860 out_free_parent: 3861 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3862 return ret; 3863 } 3864 3865 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, 3866 unsigned int nr_irqs) 
3867 { 3868 struct irq_2_irte *irte_info; 3869 struct irq_data *irq_data; 3870 struct amd_ir_data *data; 3871 int i; 3872 3873 for (i = 0; i < nr_irqs; i++) { 3874 irq_data = irq_domain_get_irq_data(domain, virq + i); 3875 if (irq_data && irq_data->chip_data) { 3876 data = irq_data->chip_data; 3877 irte_info = &data->irq_2_irte; 3878 free_irte(data->iommu, irte_info->devid, irte_info->index); 3879 kfree(data->entry); 3880 kfree(data); 3881 } 3882 } 3883 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3884 } 3885 3886 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 3887 struct amd_ir_data *ir_data, 3888 struct irq_2_irte *irte_info, 3889 struct irq_cfg *cfg); 3890 3891 static int irq_remapping_activate(struct irq_domain *domain, 3892 struct irq_data *irq_data, bool reserve) 3893 { 3894 struct amd_ir_data *data = irq_data->chip_data; 3895 struct irq_2_irte *irte_info = &data->irq_2_irte; 3896 struct amd_iommu *iommu = data->iommu; 3897 struct irq_cfg *cfg = irqd_cfg(irq_data); 3898 3899 if (!iommu) 3900 return 0; 3901 3902 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid, 3903 irte_info->index); 3904 amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg); 3905 return 0; 3906 } 3907 3908 static void irq_remapping_deactivate(struct irq_domain *domain, 3909 struct irq_data *irq_data) 3910 { 3911 struct amd_ir_data *data = irq_data->chip_data; 3912 struct irq_2_irte *irte_info = &data->irq_2_irte; 3913 struct amd_iommu *iommu = data->iommu; 3914 3915 if (iommu) 3916 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid, 3917 irte_info->index); 3918 } 3919 3920 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, 3921 enum irq_domain_bus_token bus_token) 3922 { 3923 struct amd_iommu *iommu; 3924 int devid = -1; 3925 3926 if (!amd_iommu_irq_remap) 3927 return 0; 3928 3929 if (x86_fwspec_is_ioapic(fwspec)) 3930 devid = get_ioapic_devid(fwspec->param[0]); 3931 else if 
(x86_fwspec_is_hpet(fwspec)) 3932 devid = get_hpet_devid(fwspec->param[0]); 3933 3934 if (devid < 0) 3935 return 0; 3936 iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff)); 3937 3938 return iommu && iommu->ir_domain == d; 3939 } 3940 3941 static const struct irq_domain_ops amd_ir_domain_ops = { 3942 .select = irq_remapping_select, 3943 .alloc = irq_remapping_alloc, 3944 .free = irq_remapping_free, 3945 .activate = irq_remapping_activate, 3946 .deactivate = irq_remapping_deactivate, 3947 }; 3948 3949 static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, 3950 bool ga_log_intr) 3951 { 3952 if (cpu >= 0) { 3953 entry->lo.fields_vapic.destination = 3954 APICID_TO_IRTE_DEST_LO(cpu); 3955 entry->hi.fields.destination = 3956 APICID_TO_IRTE_DEST_HI(cpu); 3957 entry->lo.fields_vapic.is_run = true; 3958 entry->lo.fields_vapic.ga_log_intr = false; 3959 } else { 3960 entry->lo.fields_vapic.is_run = false; 3961 entry->lo.fields_vapic.ga_log_intr = ga_log_intr; 3962 } 3963 } 3964 3965 /* 3966 * Update the pCPU information for an IRTE that is configured to post IRQs to 3967 * a vCPU, without issuing an IOMMU invalidation for the IRTE. 3968 * 3969 * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination 3970 * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't 3971 * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based 3972 * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is 3973 * blocking and requires a notification wake event). I.e. treat vCPUs that are 3974 * associated with a pCPU as running. This API is intended to be used when a 3975 * vCPU is scheduled in/out (or stops running for any reason), to do a fast 3976 * update of IsRun, GALogIntr, and (conditionally) Destination. 
3977 * 3978 * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached 3979 * and thus don't require an invalidation to ensure the IOMMU consumes fresh 3980 * information. 3981 */ 3982 int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 3983 { 3984 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3985 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3986 3987 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3988 return -EINVAL; 3989 3990 if (!entry || !entry->lo.fields_vapic.guest_mode) 3991 return 0; 3992 3993 if (!ir_data->iommu) 3994 return -ENODEV; 3995 3996 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3997 3998 return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3999 ir_data->irq_2_irte.index, entry); 4000 } 4001 EXPORT_SYMBOL(amd_iommu_update_ga); 4002 4003 int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 4004 { 4005 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4006 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4007 u64 valid; 4008 4009 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4010 return -EINVAL; 4011 4012 if (!entry) 4013 return 0; 4014 4015 valid = entry->lo.fields_vapic.valid; 4016 4017 entry->lo.val = 0; 4018 entry->hi.val = 0; 4019 4020 entry->lo.fields_vapic.valid = valid; 4021 entry->lo.fields_vapic.guest_mode = 1; 4022 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; 4023 entry->hi.fields.vector = ir_data->ga_vector; 4024 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; 4025 4026 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 4027 4028 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4029 ir_data->irq_2_irte.index, entry); 4030 } 4031 EXPORT_SYMBOL(amd_iommu_activate_guest_mode); 4032 4033 int amd_iommu_deactivate_guest_mode(void *data) 4034 { 4035 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4036 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4037 
struct irq_cfg *cfg = ir_data->cfg; 4038 u64 valid; 4039 4040 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4041 return -EINVAL; 4042 4043 if (!entry || !entry->lo.fields_vapic.guest_mode) 4044 return 0; 4045 4046 valid = entry->lo.fields_remap.valid; 4047 4048 entry->lo.val = 0; 4049 entry->hi.val = 0; 4050 4051 entry->lo.fields_remap.valid = valid; 4052 entry->lo.fields_remap.dm = apic->dest_mode_logical; 4053 entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED; 4054 entry->hi.fields.vector = cfg->vector; 4055 entry->lo.fields_remap.destination = 4056 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); 4057 entry->hi.fields.destination = 4058 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid); 4059 4060 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4061 ir_data->irq_2_irte.index, entry); 4062 } 4063 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); 4064 4065 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) 4066 { 4067 int ret; 4068 struct amd_iommu_pi_data *pi_data = info; 4069 struct amd_ir_data *ir_data = data->chip_data; 4070 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4071 struct iommu_dev_data *dev_data; 4072 4073 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4074 return -EINVAL; 4075 4076 if (ir_data->iommu == NULL) 4077 return -EINVAL; 4078 4079 dev_data = search_dev_data(ir_data->iommu, irte_info->devid); 4080 4081 /* Note: 4082 * This device has never been set up for guest mode. 
4083 * we should not modify the IRTE 4084 */ 4085 if (!dev_data || !dev_data->use_vapic) 4086 return -EINVAL; 4087 4088 ir_data->cfg = irqd_cfg(data); 4089 4090 if (pi_data) { 4091 pi_data->ir_data = ir_data; 4092 4093 ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); 4094 ir_data->ga_vector = pi_data->vector; 4095 ir_data->ga_tag = pi_data->ga_tag; 4096 if (pi_data->is_guest_mode) 4097 ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, 4098 pi_data->ga_log_intr); 4099 else 4100 ret = amd_iommu_deactivate_guest_mode(ir_data); 4101 } else { 4102 ret = amd_iommu_deactivate_guest_mode(ir_data); 4103 } 4104 4105 return ret; 4106 } 4107 4108 4109 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 4110 struct amd_ir_data *ir_data, 4111 struct irq_2_irte *irte_info, 4112 struct irq_cfg *cfg) 4113 { 4114 4115 /* 4116 * Atomically updates the IRTE with the new destination, vector 4117 * and flushes the interrupt entry cache. 4118 */ 4119 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid, 4120 irte_info->index, cfg->vector, 4121 cfg->dest_apicid); 4122 } 4123 4124 static int amd_ir_set_affinity(struct irq_data *data, 4125 const struct cpumask *mask, bool force) 4126 { 4127 struct amd_ir_data *ir_data = data->chip_data; 4128 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4129 struct irq_cfg *cfg = irqd_cfg(data); 4130 struct irq_data *parent = data->parent_data; 4131 struct amd_iommu *iommu = ir_data->iommu; 4132 int ret; 4133 4134 if (!iommu) 4135 return -ENODEV; 4136 4137 ret = parent->chip->irq_set_affinity(parent, mask, force); 4138 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 4139 return ret; 4140 4141 amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg); 4142 /* 4143 * After this point, all the interrupts will start arriving 4144 * at the new destination. So, time to cleanup the previous 4145 * vector allocation. 
4146 */ 4147 vector_schedule_cleanup(cfg); 4148 4149 return IRQ_SET_MASK_OK_DONE; 4150 } 4151 4152 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) 4153 { 4154 struct amd_ir_data *ir_data = irq_data->chip_data; 4155 4156 *msg = ir_data->msi_entry; 4157 } 4158 4159 static struct irq_chip amd_ir_chip = { 4160 .name = "AMD-IR", 4161 .irq_ack = apic_ack_irq, 4162 .irq_set_affinity = amd_ir_set_affinity, 4163 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, 4164 .irq_compose_msi_msg = ir_compose_msi_msg, 4165 }; 4166 4167 static const struct msi_parent_ops amdvi_msi_parent_ops = { 4168 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, 4169 .bus_select_token = DOMAIN_BUS_AMDVI, 4170 .bus_select_mask = MATCH_PCI_MSI, 4171 .prefix = "IR-", 4172 .init_dev_msi_info = msi_parent_init_dev_msi_info, 4173 }; 4174 4175 int amd_iommu_create_irq_domain(struct amd_iommu *iommu) 4176 { 4177 struct irq_domain_info info = { 4178 .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index), 4179 .ops = &amd_ir_domain_ops, 4180 .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, 4181 .host_data = iommu, 4182 .parent = arch_get_ir_parent_domain(), 4183 }; 4184 4185 if (!info.fwnode) 4186 return -ENOMEM; 4187 4188 iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops); 4189 if (!iommu->ir_domain) { 4190 irq_domain_free_fwnode(info.fwnode); 4191 return -ENOMEM; 4192 } 4193 return 0; 4194 } 4195 #endif 4196 4197 MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); 4198