1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. 4 * Author: Joerg Roedel <jroedel@suse.de> 5 * Leo Duran <leo.duran@amd.com> 6 */ 7 8 #define pr_fmt(fmt) "AMD-Vi: " fmt 9 #define dev_fmt(fmt) pr_fmt(fmt) 10 11 #include <linux/ratelimit.h> 12 #include <linux/pci.h> 13 #include <linux/acpi.h> 14 #include <linux/pci-ats.h> 15 #include <linux/bitmap.h> 16 #include <linux/slab.h> 17 #include <linux/string_choices.h> 18 #include <linux/debugfs.h> 19 #include <linux/scatterlist.h> 20 #include <linux/dma-map-ops.h> 21 #include <linux/dma-direct.h> 22 #include <linux/idr.h> 23 #include <linux/iommu-helper.h> 24 #include <linux/delay.h> 25 #include <linux/amd-iommu.h> 26 #include <linux/notifier.h> 27 #include <linux/export.h> 28 #include <linux/irq.h> 29 #include <linux/irqchip/irq-msi-lib.h> 30 #include <linux/msi.h> 31 #include <linux/irqdomain.h> 32 #include <linux/percpu.h> 33 #include <linux/cc_platform.h> 34 #include <asm/irq_remapping.h> 35 #include <asm/io_apic.h> 36 #include <asm/apic.h> 37 #include <asm/hw_irq.h> 38 #include <asm/proto.h> 39 #include <asm/iommu.h> 40 #include <asm/gart.h> 41 #include <asm/dma.h> 42 #include <uapi/linux/iommufd.h> 43 #include <linux/generic_pt/iommu.h> 44 45 #include "amd_iommu.h" 46 #include "iommufd.h" 47 #include "../irq_remapping.h" 48 #include "../iommu-pages.h" 49 50 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 51 52 /* Reserved IOVA ranges */ 53 #define MSI_RANGE_START (0xfee00000) 54 #define MSI_RANGE_END (0xfeefffff) 55 #define HT_RANGE_START (0xfd00000000ULL) 56 #define HT_RANGE_END (0xffffffffffULL) 57 58 LIST_HEAD(ioapic_map); 59 LIST_HEAD(hpet_map); 60 LIST_HEAD(acpihid_map); 61 62 const struct iommu_ops amd_iommu_ops; 63 64 int amd_iommu_max_glx_val = -1; 65 66 /* 67 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 68 * to know which ones are already in use. 69 */ 70 DEFINE_IDA(pdom_ids); 71 72 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 73 struct iommu_domain *old); 74 75 static void set_dte_entry(struct amd_iommu *iommu, 76 struct iommu_dev_data *dev_data, 77 phys_addr_t top_paddr, unsigned int top_level); 78 79 static int device_flush_dte(struct iommu_dev_data *dev_data); 80 81 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 82 phys_addr_t top_paddr, unsigned int top_level); 83 84 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 85 86 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 87 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 88 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 89 bool enable); 90 91 static void clone_aliases(struct amd_iommu *iommu, struct device *dev); 92 93 static int iommu_completion_wait(struct amd_iommu *iommu); 94 95 /**************************************************************************** 96 * 97 * Helper functions 98 * 99 ****************************************************************************/ 100 101 static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) 102 { 103 /* 104 * Note: 105 * We use arch_cmpxchg128_local() because: 106 * - Need cmpxchg16b instruction mainly for 128-bit store to DTE 107 * (not necessary for cmpxchg since this function is already 108 * protected by a spin_lock for this DTE). 109 * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. 
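	 *
	 * Passing the current value (*ptr) as the "old" operand means the
	 * compare part always succeeds, so this boils down to a single
	 * 128-bit store performed with one cmpxchg16b instruction.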
	 */
	arch_cmpxchg128_local(ptr, *ptr, val);
}

static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new)
{
	struct dev_table_entry old;

	old.data128[1] = ptr->data128[1];
	/*
	 * Preserve DTE_DATA2_INTR_MASK. This needs to be
	 * done here since it requires to be inside
	 * spin_lock(&dev_data->dte_lock) context.
	 */
	new->data[2] &= ~DTE_DATA2_INTR_MASK;
	new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK;

	amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]);
}

static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new)
{
	amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]);
}

/*
 * Note:
 * The IOMMU reads the entire Device Table entry in a single 256-bit
 * transaction, but the driver programs the DTE using two 128-bit cmpxchg
 * operations. So the driver needs to ensure the following:
 *   - The DTE[V|GV] bit is written last when setting.
 *   - The DTE[V|GV] bit is written first when clearing.
 *
 * This function is used only by code that updates the DMA translation part
 * of the DTE, so only control bits related to DMA are considered when
 * updating the entry.
 */
static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
			  struct dev_table_entry *new)
{
	unsigned long flags;
	struct dev_table_entry *dev_table = get_dev_table(iommu);
	struct dev_table_entry *ptr = &dev_table[dev_data->devid];

	spin_lock_irqsave(&dev_data->dte_lock, flags);

	if (!(ptr->data[0] & DTE_FLAG_V)) {
		/* Existing DTE is not valid. */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!(new->data[0] & DTE_FLAG_V)) {
		/* Existing DTE is valid. New DTE is not valid. */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has no guest page table.
		 */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) {
		/*
		 * Both DTEs are valid.
		 * Existing DTE has a guest page table,
		 * new DTE has no guest page table.
		 */
		write_dte_lower128(ptr, new);
		write_dte_upper128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) !=
		   FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) {
		/*
		 * Both DTEs are valid and have a guest page table,
		 * but with a different number of levels. So, we need
		 * to update both the upper and lower 128-bit values,
		 * which requires disabling and flushing.
		 */
		struct dev_table_entry clear = {};

		/* First disable DTE */
		write_dte_lower128(ptr, &clear);
		iommu_flush_dte_sync(iommu, dev_data->devid);

		/* Then update DTE */
		write_dte_upper128(ptr, new);
		write_dte_lower128(ptr, new);
		iommu_flush_dte_sync(iommu, dev_data->devid);
	} else {
		/*
		 * Both DTEs are valid and have a guest page table with the
		 * same number of levels. We only need to update the lower
		 * 128-bit, so there is no need to disable the DTE.
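		 * (The only DMA-related field kept in the upper half that
		 * this path cares about is the guest paging mode, and it is
		 * unchanged here.)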
205 */ 206 write_dte_lower128(ptr, new); 207 } 208 209 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 210 } 211 212 void amd_iommu_update_dte(struct amd_iommu *iommu, 213 struct iommu_dev_data *dev_data, 214 struct dev_table_entry *new) 215 { 216 update_dte256(iommu, dev_data, new); 217 clone_aliases(iommu, dev_data->dev); 218 device_flush_dte(dev_data); 219 iommu_completion_wait(iommu); 220 } 221 222 static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, 223 struct dev_table_entry *dte) 224 { 225 unsigned long flags; 226 struct dev_table_entry *ptr; 227 struct dev_table_entry *dev_table = get_dev_table(iommu); 228 229 ptr = &dev_table[dev_data->devid]; 230 231 spin_lock_irqsave(&dev_data->dte_lock, flags); 232 dte->data128[0] = ptr->data128[0]; 233 dte->data128[1] = ptr->data128[1]; 234 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 235 } 236 237 static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) 238 { 239 return (pdom && (pdom->pd_mode == PD_MODE_V2)); 240 } 241 242 static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom) 243 { 244 return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY); 245 } 246 247 /* 248 * We cannot support PASID w/ existing v1 page table in the same domain 249 * since it will be nested. However, existing domain w/ v2 page table 250 * or passthrough mode can be used for PASID. 251 */ 252 static inline bool pdom_is_sva_capable(struct protection_domain *pdom) 253 { 254 return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom); 255 } 256 257 static inline int get_acpihid_device_id(struct device *dev, 258 struct acpihid_map_entry **entry) 259 { 260 struct acpi_device *adev = ACPI_COMPANION(dev); 261 struct acpihid_map_entry *p, *p1 = NULL; 262 int hid_count = 0; 263 bool fw_bug; 264 265 if (!adev) 266 return -ENODEV; 267 268 list_for_each_entry(p, &acpihid_map, list) { 269 if (acpi_dev_hid_uid_match(adev, p->hid, 270 p->uid[0] ? 
p->uid : NULL)) { 271 p1 = p; 272 fw_bug = false; 273 hid_count = 1; 274 break; 275 } 276 277 /* 278 * Count HID matches w/o UID, raise FW_BUG but allow exactly one match 279 */ 280 if (acpi_dev_hid_match(adev, p->hid)) { 281 p1 = p; 282 hid_count++; 283 fw_bug = true; 284 } 285 } 286 287 if (!p1) 288 return -EINVAL; 289 if (fw_bug) 290 dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", 291 hid_count, str_plural(hid_count)); 292 if (hid_count > 1) 293 return -EINVAL; 294 if (entry) 295 *entry = p1; 296 297 return p1->devid; 298 } 299 300 static inline int get_device_sbdf_id(struct device *dev) 301 { 302 int sbdf; 303 304 if (dev_is_pci(dev)) 305 sbdf = get_pci_sbdf_id(to_pci_dev(dev)); 306 else 307 sbdf = get_acpihid_device_id(dev, NULL); 308 309 return sbdf; 310 } 311 312 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu) 313 { 314 struct dev_table_entry *dev_table; 315 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 316 317 BUG_ON(pci_seg == NULL); 318 dev_table = pci_seg->dev_table; 319 BUG_ON(dev_table == NULL); 320 321 return dev_table; 322 } 323 324 static inline u16 get_device_segment(struct device *dev) 325 { 326 u16 seg; 327 328 if (dev_is_pci(dev)) { 329 struct pci_dev *pdev = to_pci_dev(dev); 330 331 seg = pci_domain_nr(pdev->bus); 332 } else { 333 u32 devid = get_acpihid_device_id(dev, NULL); 334 335 seg = PCI_SBDF_TO_SEGID(devid); 336 } 337 338 return seg; 339 } 340 341 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */ 342 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid) 343 { 344 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 345 346 pci_seg->rlookup_table[devid] = iommu; 347 } 348 349 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) 350 { 351 struct amd_iommu_pci_seg *pci_seg; 352 353 for_each_pci_segment(pci_seg) { 354 if (pci_seg->id == seg) 355 return pci_seg->rlookup_table[devid]; 356 } 357 return NULL; 358 } 359 360 static struct amd_iommu *rlookup_amd_iommu(struct device *dev) 361 { 362 u16 seg = get_device_segment(dev); 363 int devid = get_device_sbdf_id(dev); 364 365 if (devid < 0) 366 return NULL; 367 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid)); 368 } 369 370 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) 371 { 372 struct iommu_dev_data *dev_data; 373 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 374 375 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); 376 if (!dev_data) 377 return NULL; 378 379 mutex_init(&dev_data->mutex); 380 spin_lock_init(&dev_data->dte_lock); 381 dev_data->devid = devid; 382 ratelimit_default_init(&dev_data->rs); 383 384 llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list); 385 return dev_data; 386 } 387 388 struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) 389 { 390 struct iommu_dev_data *dev_data; 391 struct llist_node *node; 392 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 393 394 if (llist_empty(&pci_seg->dev_data_list)) 395 return NULL; 396 397 node = pci_seg->dev_data_list.first; 398 llist_for_each_entry(dev_data, node, dev_data_list) { 399 if (dev_data->devid == devid) 400 return dev_data; 401 } 402 403 return NULL; 404 } 405 406 static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) 407 { 408 struct dev_table_entry new; 409 struct amd_iommu *iommu; 410 struct iommu_dev_data *dev_data, *alias_data; 411 u16 devid = pci_dev_id(pdev); 412 int ret = 0; 413 414 if (devid == alias) 415 return 0; 416 417 iommu = 
rlookup_amd_iommu(&pdev->dev);
	if (!iommu)
		return 0;

	/* Copy the data from pdev */
	dev_data = dev_iommu_priv_get(&pdev->dev);
	if (!dev_data) {
		pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid);
		ret = -EINVAL;
		goto out;
	}
	get_dte256(iommu, dev_data, &new);

	/* Setup alias */
	alias_data = find_dev_data(iommu, alias);
	if (!alias_data) {
		pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias);
		ret = -EINVAL;
		goto out;
	}
	update_dte256(iommu, alias_data, &new);

	amd_iommu_set_rlookup_table(iommu, alias);
out:
	return ret;
}

static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(dev))
		return;
	pdev = to_pci_dev(dev);

	/*
	 * The IVRS alias stored in the alias table may not be
	 * part of the PCI DMA aliases if its bus differs
	 * from the original device.
	 */
	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);

	pci_for_each_dma_alias(pdev, clone_alias, NULL);
}

static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
	u16 ivrs_alias;

	/* For ACPI HID devices, there are no aliases */
	if (!dev_is_pci(dev))
		return;

	/*
	 * Add the IVRS alias to the pci aliases if it is on the same
	 * bus. The IVRS table may know about a quirk that we don't.
	 */
	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
	if (ivrs_alias != pci_dev_id(pdev) &&
	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);

	clone_aliases(iommu, dev);
}

static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_dev_data *dev_data;

	dev_data = search_dev_data(iommu, devid);

	if (dev_data == NULL) {
		dev_data = alloc_dev_data(iommu, devid);
		if (!dev_data)
			return NULL;

		if (translation_pre_enabled(iommu))
			dev_data->defer_attach = true;
	}

	return dev_data;
}

/*
 * Find or create an IOMMU group for an acpihid device.
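 * All acpihid_map entries that report the same device ID share one group:
 * the first device creates the group and later devices take a reference.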
504 */ 505 static struct iommu_group *acpihid_device_group(struct device *dev) 506 { 507 struct acpihid_map_entry *p, *entry = NULL; 508 int devid; 509 510 devid = get_acpihid_device_id(dev, &entry); 511 if (devid < 0) 512 return ERR_PTR(devid); 513 514 list_for_each_entry(p, &acpihid_map, list) { 515 if ((devid == p->devid) && p->group) 516 entry->group = p->group; 517 } 518 519 if (!entry->group) 520 entry->group = generic_device_group(dev); 521 else 522 iommu_group_ref_get(entry->group); 523 524 return entry->group; 525 } 526 527 static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data) 528 { 529 return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP); 530 } 531 532 static u32 pdev_get_caps(struct pci_dev *pdev) 533 { 534 int features; 535 u32 flags = 0; 536 537 if (pci_ats_supported(pdev)) 538 flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 539 540 if (pci_pri_supported(pdev)) 541 flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 542 543 features = pci_pasid_features(pdev); 544 if (features >= 0) { 545 flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 546 547 if (features & PCI_PASID_CAP_EXEC) 548 flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 549 550 if (features & PCI_PASID_CAP_PRIV) 551 flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 552 } 553 554 return flags; 555 } 556 557 static inline int pdev_enable_cap_ats(struct pci_dev *pdev) 558 { 559 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 560 int ret = -EINVAL; 561 562 if (dev_data->ats_enabled) 563 return 0; 564 565 if (amd_iommu_iotlb_sup && 566 (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) { 567 ret = pci_enable_ats(pdev, PAGE_SHIFT); 568 if (!ret) { 569 dev_data->ats_enabled = 1; 570 dev_data->ats_qdep = pci_ats_queue_depth(pdev); 571 } 572 } 573 574 return ret; 575 } 576 577 static inline void pdev_disable_cap_ats(struct pci_dev *pdev) 578 { 579 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 580 581 if (dev_data->ats_enabled) { 582 pci_disable_ats(pdev); 583 dev_data->ats_enabled = 0; 584 } 585 } 586 587 static inline int pdev_enable_cap_pri(struct pci_dev *pdev) 588 { 589 struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev); 590 int ret = -EINVAL; 591 592 if (dev_data->pri_enabled) 593 return 0; 594 595 if (!dev_data->ats_enabled) 596 return 0; 597 598 if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) { 599 /* 600 * First reset the PRI state of the device. 
		 * FIXME: Hardcode number of outstanding requests for now
		 */
		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
			dev_data->pri_enabled = 1;
			dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev);

			ret = 0;
		}
	}

	return ret;
}

static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);

	if (dev_data->pri_enabled) {
		pci_disable_pri(pdev);
		dev_data->pri_enabled = 0;
	}
}

static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
	int ret = -EINVAL;

	if (dev_data->pasid_enabled)
		return 0;

	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
		/* Only allow access to user-accessible pages */
		ret = pci_enable_pasid(pdev, 0);
		if (!ret)
			dev_data->pasid_enabled = 1;
	}

	return ret;
}

static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);

	if (dev_data->pasid_enabled) {
		pci_disable_pasid(pdev);
		dev_data->pasid_enabled = 0;
	}
}

static void pdev_enable_caps(struct pci_dev *pdev)
{
	pdev_enable_cap_pasid(pdev);
	pdev_enable_cap_ats(pdev);
	pdev_enable_cap_pri(pdev);
}

static void pdev_disable_caps(struct pci_dev *pdev)
{
	pdev_disable_cap_ats(pdev);
	pdev_disable_cap_pasid(pdev);
	pdev_disable_cap_pri(pdev);
}

/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	struct amd_iommu_pci_seg *pci_seg;
	struct amd_iommu *iommu;
	int devid, sbdf;

	if (!dev)
		return false;

	sbdf = get_device_sbdf_id(dev);
	if (sbdf < 0)
		return false;
	devid = PCI_SBDF_TO_DEVID(sbdf);

	iommu = rlookup_amd_iommu(dev);
	if (!iommu)
		return false;

	/* Out of our scope? */
	pci_seg = iommu->pci_seg;
	if (devid > pci_seg->last_bdf)
		return false;

	return true;
}

static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
{
	struct iommu_dev_data *dev_data;
	int devid, sbdf;

	if (dev_iommu_priv_get(dev))
		return 0;

	sbdf = get_device_sbdf_id(dev);
	if (sbdf < 0)
		return sbdf;

	devid = PCI_SBDF_TO_DEVID(sbdf);
	dev_data = find_dev_data(iommu, devid);
	if (!dev_data)
		return -ENOMEM;

	dev_data->dev = dev;

	/*
	 * dev_iommu_priv_set() needs to be called before setup_aliases().
	 * Otherwise, subsequent calls to dev_iommu_priv_get() will fail.
	 */
	dev_iommu_priv_set(dev, dev_data);
	setup_aliases(iommu, dev);

	/*
	 * By default we use passthrough mode for IOMMUv2 capable devices.
	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
	 * invalid address), we ignore the capability for the device so
	 * it'll be forced to go into translation mode.
727 */ 728 if ((iommu_default_passthrough() || !amd_iommu_force_isolation) && 729 dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) { 730 dev_data->flags = pdev_get_caps(to_pci_dev(dev)); 731 } 732 733 return 0; 734 } 735 736 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) 737 { 738 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 739 struct dev_table_entry *dev_table = get_dev_table(iommu); 740 int devid, sbdf; 741 742 sbdf = get_device_sbdf_id(dev); 743 if (sbdf < 0) 744 return; 745 746 devid = PCI_SBDF_TO_DEVID(sbdf); 747 pci_seg->rlookup_table[devid] = NULL; 748 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry)); 749 750 setup_aliases(iommu, dev); 751 } 752 753 754 /**************************************************************************** 755 * 756 * Interrupt handling functions 757 * 758 ****************************************************************************/ 759 760 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) 761 { 762 int i; 763 struct dev_table_entry dte; 764 struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); 765 766 get_dte256(iommu, dev_data, &dte); 767 768 for (i = 0; i < 4; ++i) 769 pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); 770 } 771 772 static void dump_command(unsigned long phys_addr) 773 { 774 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 775 int i; 776 777 for (i = 0; i < 4; ++i) 778 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]); 779 } 780 781 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event) 782 { 783 struct iommu_dev_data *dev_data = NULL; 784 int devid, vmg_tag, flags; 785 struct pci_dev *pdev; 786 u64 spa; 787 788 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 789 vmg_tag = (event[1]) & 0xFFFF; 790 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 791 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8); 792 793 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 794 devid & 0xff); 795 if (pdev) 796 dev_data = dev_iommu_priv_get(&pdev->dev); 797 798 if (dev_data) { 799 if (__ratelimit(&dev_data->rs)) { 800 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 801 vmg_tag, spa, flags); 802 } 803 } else { 804 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", 805 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 806 vmg_tag, spa, flags); 807 } 808 809 if (pdev) 810 pci_dev_put(pdev); 811 } 812 813 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event) 814 { 815 struct iommu_dev_data *dev_data = NULL; 816 int devid, flags_rmp, vmg_tag, flags; 817 struct pci_dev *pdev; 818 u64 gpa; 819 820 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 821 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF; 822 vmg_tag = (event[1]) & 0xFFFF; 823 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 824 gpa = ((u64)event[3] << 32) | event[2]; 825 826 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 827 devid & 0xff); 828 if (pdev) 829 dev_data = dev_iommu_priv_get(&pdev->dev); 830 831 if (dev_data) { 832 if (__ratelimit(&dev_data->rs)) { 833 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", 834 vmg_tag, gpa, flags_rmp, flags); 835 } 836 } else { 837 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, 
flags_rmp=0x%04x, flags=0x%04x]\n", 838 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 839 vmg_tag, gpa, flags_rmp, flags); 840 } 841 842 if (pdev) 843 pci_dev_put(pdev); 844 } 845 846 #define IS_IOMMU_MEM_TRANSACTION(flags) \ 847 (((flags) & EVENT_FLAG_I) == 0) 848 849 #define IS_WRITE_REQUEST(flags) \ 850 ((flags) & EVENT_FLAG_RW) 851 852 static void amd_iommu_report_page_fault(struct amd_iommu *iommu, 853 u16 devid, u16 domain_id, 854 u64 address, int flags) 855 { 856 struct iommu_dev_data *dev_data = NULL; 857 struct pci_dev *pdev; 858 859 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid), 860 devid & 0xff); 861 if (pdev) 862 dev_data = dev_iommu_priv_get(&pdev->dev); 863 864 if (dev_data) { 865 /* 866 * If this is a DMA fault (for which the I(nterrupt) 867 * bit will be unset), allow report_iommu_fault() to 868 * prevent logging it. 869 */ 870 if (IS_IOMMU_MEM_TRANSACTION(flags)) { 871 /* Device not attached to domain properly */ 872 if (dev_data->domain == NULL) { 873 pr_err_ratelimited("Event logged [Device not attached to domain properly]\n"); 874 pr_err_ratelimited(" device=%04x:%02x:%02x.%x domain=0x%04x\n", 875 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), 876 PCI_FUNC(devid), domain_id); 877 goto out; 878 } 879 880 if (!report_iommu_fault(&dev_data->domain->domain, 881 &pdev->dev, address, 882 IS_WRITE_REQUEST(flags) ? 883 IOMMU_FAULT_WRITE : 884 IOMMU_FAULT_READ)) 885 goto out; 886 } 887 888 if (__ratelimit(&dev_data->rs)) { 889 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", 890 domain_id, address, flags); 891 } 892 } else { 893 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", 894 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 895 domain_id, address, flags); 896 } 897 898 out: 899 if (pdev) 900 pci_dev_put(pdev); 901 } 902 903 static void iommu_print_event(struct amd_iommu *iommu, void *__evt) 904 { 905 struct device *dev = iommu->iommu.dev; 906 int type, devid, flags, tag; 907 volatile u32 *event = __evt; 908 int count = 0; 909 u64 address, ctrl; 910 u32 pasid; 911 912 retry: 913 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 914 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 915 pasid = (event[0] & EVENT_DOMID_MASK_HI) | 916 (event[1] & EVENT_DOMID_MASK_LO); 917 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 918 address = (u64)(((u64)event[3]) << 32) | event[2]; 919 ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); 920 921 if (type == 0) { 922 /* Did we hit the erratum? 
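		 * (the event log interrupt fired, but the entry still reads
		 * back as all zeros; see the erratum 732 note at the end of
		 * this function)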
*/ 923 if (++count == LOOP_TIMEOUT) { 924 pr_err("No event written to event log\n"); 925 return; 926 } 927 udelay(1); 928 goto retry; 929 } 930 931 if (type == EVENT_TYPE_IO_FAULT) { 932 amd_iommu_report_page_fault(iommu, devid, pasid, address, flags); 933 return; 934 } 935 936 switch (type) { 937 case EVENT_TYPE_ILL_DEV: 938 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 939 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 940 pasid, address, flags); 941 dev_err(dev, "Control Reg : 0x%llx\n", ctrl); 942 dump_dte_entry(iommu, devid); 943 break; 944 case EVENT_TYPE_DEV_TAB_ERR: 945 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x " 946 "address=0x%llx flags=0x%04x]\n", 947 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 948 address, flags); 949 break; 950 case EVENT_TYPE_PAGE_TAB_ERR: 951 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n", 952 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 953 pasid, address, flags); 954 break; 955 case EVENT_TYPE_ILL_CMD: 956 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address); 957 dump_command(address); 958 break; 959 case EVENT_TYPE_CMD_HARD_ERR: 960 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n", 961 address, flags); 962 break; 963 case EVENT_TYPE_IOTLB_INV_TO: 964 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n", 965 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 966 address); 967 break; 968 case EVENT_TYPE_INV_DEV_REQ: 969 dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", 970 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 971 pasid, address, flags); 972 break; 973 case EVENT_TYPE_RMP_FAULT: 974 amd_iommu_report_rmp_fault(iommu, event); 975 break; 976 case EVENT_TYPE_RMP_HW_ERR: 977 amd_iommu_report_rmp_hw_error(iommu, event); 978 break; 979 case EVENT_TYPE_INV_PPR_REQ: 980 pasid = PPR_PASID(*((u64 *)__evt)); 981 tag = event[1] & 0x03FF; 982 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n", 983 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 984 pasid, address, flags, tag); 985 break; 986 default: 987 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n", 988 event[0], event[1], event[2], event[3]); 989 } 990 991 /* 992 * To detect the hardware errata 732 we need to clear the 993 * entry back to zero. This issue does not exist on SNP 994 * enabled system. Also this buffer is not writeable on 995 * SNP enabled system. 
996 */ 997 if (!amd_iommu_snp_en) 998 memset(__evt, 0, 4 * sizeof(u32)); 999 } 1000 1001 static void iommu_poll_events(struct amd_iommu *iommu) 1002 { 1003 u32 head, tail; 1004 1005 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 1006 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 1007 1008 while (head != tail) { 1009 iommu_print_event(iommu, iommu->evt_buf + head); 1010 1011 /* Update head pointer of hardware ring-buffer */ 1012 head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; 1013 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 1014 } 1015 1016 } 1017 1018 #ifdef CONFIG_IRQ_REMAP 1019 static int (*iommu_ga_log_notifier)(u32); 1020 1021 int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) 1022 { 1023 iommu_ga_log_notifier = notifier; 1024 1025 /* 1026 * Ensure all in-flight IRQ handlers run to completion before returning 1027 * to the caller, e.g. to ensure module code isn't unloaded while it's 1028 * being executed in the IRQ handler. 1029 */ 1030 if (!notifier) 1031 synchronize_rcu(); 1032 1033 return 0; 1034 } 1035 EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); 1036 1037 static void iommu_poll_ga_log(struct amd_iommu *iommu) 1038 { 1039 u32 head, tail; 1040 1041 if (iommu->ga_log == NULL) 1042 return; 1043 1044 head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1045 tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET); 1046 1047 while (head != tail) { 1048 volatile u64 *raw; 1049 u64 log_entry; 1050 1051 raw = (u64 *)(iommu->ga_log + head); 1052 1053 /* Avoid memcpy function-call overhead */ 1054 log_entry = *raw; 1055 1056 /* Update head pointer of hardware ring-buffer */ 1057 head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE; 1058 writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 1059 1060 /* Handle GA entry */ 1061 switch (GA_REQ_TYPE(log_entry)) { 1062 case GA_GUEST_NR: 1063 if (!iommu_ga_log_notifier) 1064 break; 1065 1066 pr_debug("%s: devid=%#x, ga_tag=%#x\n", 1067 __func__, GA_DEVID(log_entry), 1068 GA_TAG(log_entry)); 1069 1070 if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0) 1071 pr_err("GA log notifier failed.\n"); 1072 break; 1073 default: 1074 break; 1075 } 1076 } 1077 } 1078 1079 static void 1080 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) 1081 { 1082 if (!irq_remapping_enabled || !dev_is_pci(dev) || 1083 !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev))) 1084 return; 1085 1086 dev_set_msi_domain(dev, iommu->ir_domain); 1087 } 1088 1089 #else /* CONFIG_IRQ_REMAP */ 1090 static inline void 1091 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } 1092 #endif /* !CONFIG_IRQ_REMAP */ 1093 1094 static void amd_iommu_handle_irq(void *data, const char *evt_type, 1095 u32 int_mask, u32 overflow_mask, 1096 void (*int_handler)(struct amd_iommu *), 1097 void (*overflow_handler)(struct amd_iommu *)) 1098 { 1099 struct amd_iommu *iommu = (struct amd_iommu *) data; 1100 u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 1101 u32 mask = int_mask | overflow_mask; 1102 1103 while (status & mask) { 1104 /* Enable interrupt sources again */ 1105 writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET); 1106 1107 if (int_handler) { 1108 pr_devel("Processing IOMMU (ivhd%d) %s Log\n", 1109 iommu->index, evt_type); 1110 int_handler(iommu); 1111 } 1112 1113 if ((status & overflow_mask) && overflow_handler) 1114 overflow_handler(iommu); 1115 1116 /* 1117 * Hardware bug: ERBT1312 1118 * When re-enabling interrupt (by writing 1 1119 * to clear the bit), the hardware might also try to set 1120 * 
the interrupt bit in the event status register.
		 * In this scenario, the bit will be set and will disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, the driver will need to go through the interrupt
		 * handler again and re-clear the bits.
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}
}

irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
			     MMIO_STATUS_EVT_OVERFLOW_MASK,
			     iommu_poll_events, amd_iommu_restart_event_logging);

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
{
	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
			     MMIO_STATUS_PPR_OVERFLOW_MASK,
			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
{
#ifdef CONFIG_IRQ_REMAP
	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
#endif

	return IRQ_HANDLED;
}

irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	amd_iommu_int_thread_evtlog(irq, data);
	amd_iommu_int_thread_pprlog(irq, data);
	amd_iommu_int_thread_galog(irq, data);

	return IRQ_HANDLED;
}

/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/

static void dump_command_buffer(struct amd_iommu *iommu)
{
	struct iommu_cmd *cmd;
	u32 head, tail;
	int i;

	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);

	pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
	       MMIO_CMD_BUFFER_TAIL(tail));

	for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
		cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
		pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
		       cmd->data[3]);
	}
}

static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
	int i = 0;

	/*
	 * cmd_sem holds a monotonically non-decreasing completion sequence
	 * number.
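	 * The signed 64-bit subtraction below keeps the "has the sequence
	 * number been reached yet" test correct even if the counter wraps.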
	 */
	while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 &&
	       i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {

		pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
			 iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
			 PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));

		if (amd_iommu_dump)
			DO_ONCE_LITE(dump_command_buffer, iommu);

		return -EIO;
	}

	return 0;
}

static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd)
{
	u8 *target;
	u32 tail;

	/* Copy command to buffer */
	tail = iommu->cmd_buf_tail;
	target = iommu->cmd_buf + tail;
	memcpy(target, cmd, sizeof(*cmd));

	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
	iommu->cmd_buf_tail = tail;

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}

static void build_completion_wait(struct iommu_cmd *cmd,
				  struct amd_iommu *iommu,
				  u64 data)
{
	u64 paddr = iommu->cmd_sem_paddr;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(paddr);
	cmd->data[2] = lower_32_bits(data);
	cmd->data[3] = upper_32_bits(data);
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}

static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}

/*
 * Builds an invalidation address which is suitable for one page or multiple
 * pages. Sets the size bit (S) if more than one page is flushed.
 */
static inline u64 build_inv_address(u64 address, size_t size)
{
	u64 pages, end, msb_diff;

	pages = iommu_num_pages(address, size, PAGE_SIZE);

	if (pages == 1)
		return address & PAGE_MASK;

	end = address + size - 1;

	/*
	 * msb_diff holds the index of the most significant bit that
	 * flipped between the start and end.
	 */
	msb_diff = fls64(end ^ address) - 1;

	/*
	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
	 * between the start and the end, invalidate everything.
	 */
	if (unlikely(msb_diff > 51)) {
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
	} else {
		/*
		 * The msb-bit must be clear on the address. Just set all the
		 * lower bits.
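		 *
		 * Worked example (4 KiB pages): address = 0x100000 and
		 * size = 0x3000 give end = 0x102fff and msb_diff = 13, so
		 * the command address becomes 0x101000 with the size bit
		 * set, which covers at least 0x100000 - 0x102fff.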
1295 */ 1296 address |= (1ull << msb_diff) - 1; 1297 } 1298 1299 /* Clear bits 11:0 */ 1300 address &= PAGE_MASK; 1301 1302 /* Set the size bit - we flush more than one 4kb page */ 1303 return address | CMD_INV_IOMMU_PAGES_SIZE_MASK; 1304 } 1305 1306 static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 1307 size_t size, u16 domid, 1308 ioasid_t pasid, bool gn) 1309 { 1310 u64 inv_address = build_inv_address(address, size); 1311 1312 memset(cmd, 0, sizeof(*cmd)); 1313 1314 cmd->data[1] |= domid; 1315 cmd->data[2] = lower_32_bits(inv_address); 1316 cmd->data[3] = upper_32_bits(inv_address); 1317 /* PDE bit - we want to flush everything, not only the PTEs */ 1318 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 1319 if (gn) { 1320 cmd->data[0] |= pasid; 1321 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 1322 } 1323 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); 1324 } 1325 1326 static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, 1327 u64 address, size_t size, 1328 ioasid_t pasid, bool gn) 1329 { 1330 u64 inv_address = build_inv_address(address, size); 1331 1332 memset(cmd, 0, sizeof(*cmd)); 1333 1334 cmd->data[0] = devid; 1335 cmd->data[0] |= (qdep & 0xff) << 24; 1336 cmd->data[1] = devid; 1337 cmd->data[2] = lower_32_bits(inv_address); 1338 cmd->data[3] = upper_32_bits(inv_address); 1339 if (gn) { 1340 cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; 1341 cmd->data[1] |= (pasid & 0xff) << 16; 1342 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 1343 } 1344 1345 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); 1346 } 1347 1348 static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid, 1349 int status, int tag, u8 gn) 1350 { 1351 memset(cmd, 0, sizeof(*cmd)); 1352 1353 cmd->data[0] = devid; 1354 if (gn) { 1355 cmd->data[1] = pasid; 1356 cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; 1357 } 1358 cmd->data[3] = tag & 0x1ff; 1359 cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; 1360 1361 CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); 1362 } 1363 1364 static void build_inv_all(struct iommu_cmd *cmd) 1365 { 1366 memset(cmd, 0, sizeof(*cmd)); 1367 CMD_SET_TYPE(cmd, CMD_INV_ALL); 1368 } 1369 1370 static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) 1371 { 1372 memset(cmd, 0, sizeof(*cmd)); 1373 cmd->data[0] = devid; 1374 CMD_SET_TYPE(cmd, CMD_INV_IRT); 1375 } 1376 1377 /* 1378 * Writes the command to the IOMMUs command buffer and informs the 1379 * hardware about the new command. 1380 */ 1381 static int __iommu_queue_command_sync(struct amd_iommu *iommu, 1382 struct iommu_cmd *cmd, 1383 bool sync) 1384 { 1385 unsigned int count = 0; 1386 u32 left, next_tail; 1387 1388 next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; 1389 again: 1390 left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE; 1391 1392 if (left <= 0x20) { 1393 /* Skip udelay() the first time around */ 1394 if (count++) { 1395 if (count == LOOP_TIMEOUT) { 1396 pr_err("Command buffer timeout\n"); 1397 return -EIO; 1398 } 1399 1400 udelay(1); 1401 } 1402 1403 /* Update head and recheck remaining space */ 1404 iommu->cmd_buf_head = readl(iommu->mmio_base + 1405 MMIO_CMD_HEAD_OFFSET); 1406 1407 goto again; 1408 } 1409 1410 copy_cmd_to_buffer(iommu, cmd); 1411 1412 /* Do we need to make sure all commands are processed? 
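	 * (need_sync is consumed by iommu_completion_wait(), which only
	 * queues a COMPLETION_WAIT command when the flag is set)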
*/ 1413 iommu->need_sync = sync; 1414 1415 return 0; 1416 } 1417 1418 static int iommu_queue_command_sync(struct amd_iommu *iommu, 1419 struct iommu_cmd *cmd, 1420 bool sync) 1421 { 1422 unsigned long flags; 1423 int ret; 1424 1425 raw_spin_lock_irqsave(&iommu->lock, flags); 1426 ret = __iommu_queue_command_sync(iommu, cmd, sync); 1427 raw_spin_unlock_irqrestore(&iommu->lock, flags); 1428 1429 return ret; 1430 } 1431 1432 static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 1433 { 1434 return iommu_queue_command_sync(iommu, cmd, true); 1435 } 1436 1437 static u64 get_cmdsem_val(struct amd_iommu *iommu) 1438 { 1439 lockdep_assert_held(&iommu->lock); 1440 return ++iommu->cmd_sem_val; 1441 } 1442 1443 /* 1444 * This function queues a completion wait command into the command 1445 * buffer of an IOMMU 1446 */ 1447 static int iommu_completion_wait(struct amd_iommu *iommu) 1448 { 1449 struct iommu_cmd cmd; 1450 unsigned long flags; 1451 int ret; 1452 u64 data; 1453 1454 if (!iommu->need_sync) 1455 return 0; 1456 1457 raw_spin_lock_irqsave(&iommu->lock, flags); 1458 1459 data = get_cmdsem_val(iommu); 1460 build_completion_wait(&cmd, iommu, data); 1461 1462 ret = __iommu_queue_command_sync(iommu, &cmd, false); 1463 raw_spin_unlock_irqrestore(&iommu->lock, flags); 1464 1465 if (ret) 1466 return ret; 1467 1468 ret = wait_on_sem(iommu, data); 1469 1470 return ret; 1471 } 1472 1473 static void domain_flush_complete(struct protection_domain *domain) 1474 { 1475 struct pdom_iommu_info *pdom_iommu_info; 1476 unsigned long i; 1477 1478 lockdep_assert_held(&domain->lock); 1479 1480 /* 1481 * Devices of this domain are behind this IOMMU 1482 * We need to wait for completion of all commands. 1483 */ 1484 xa_for_each(&domain->iommu_array, i, pdom_iommu_info) 1485 iommu_completion_wait(pdom_iommu_info->iommu); 1486 } 1487 1488 static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) 1489 { 1490 struct iommu_cmd cmd; 1491 1492 build_inv_dte(&cmd, devid); 1493 1494 return iommu_queue_command(iommu, &cmd); 1495 } 1496 1497 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid) 1498 { 1499 int ret; 1500 1501 ret = iommu_flush_dte(iommu, devid); 1502 if (!ret) 1503 iommu_completion_wait(iommu); 1504 } 1505 1506 static void amd_iommu_flush_dte_all(struct amd_iommu *iommu) 1507 { 1508 u32 devid; 1509 u16 last_bdf = iommu->pci_seg->last_bdf; 1510 1511 for (devid = 0; devid <= last_bdf; ++devid) 1512 iommu_flush_dte(iommu, devid); 1513 1514 iommu_completion_wait(iommu); 1515 } 1516 1517 /* 1518 * This function uses heavy locking and may disable irqs for some time. But 1519 * this is no issue because it is only called during resume. 
1520 */ 1521 static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) 1522 { 1523 u32 dom_id; 1524 u16 last_bdf = iommu->pci_seg->last_bdf; 1525 1526 for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { 1527 struct iommu_cmd cmd; 1528 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1529 dom_id, IOMMU_NO_PASID, false); 1530 iommu_queue_command(iommu, &cmd); 1531 } 1532 1533 iommu_completion_wait(iommu); 1534 } 1535 1536 static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) 1537 { 1538 struct iommu_cmd cmd; 1539 1540 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1541 dom_id, IOMMU_NO_PASID, false); 1542 iommu_queue_command(iommu, &cmd); 1543 1544 iommu_completion_wait(iommu); 1545 } 1546 1547 static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size) 1548 { 1549 int ret = 0; 1550 struct amd_iommu_viommu *aviommu; 1551 1552 list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) { 1553 unsigned long i; 1554 struct guest_domain_mapping_info *gdom_info; 1555 struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, 1556 struct amd_iommu, iommu); 1557 1558 xa_lock(&aviommu->gdomid_array); 1559 xa_for_each(&aviommu->gdomid_array, i, gdom_info) { 1560 struct iommu_cmd cmd; 1561 1562 pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__, 1563 iommu->devid, gdom_info->hdom_id); 1564 build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id, 1565 IOMMU_NO_PASID, false); 1566 ret |= iommu_queue_command(iommu, &cmd); 1567 } 1568 xa_unlock(&aviommu->gdomid_array); 1569 } 1570 return ret; 1571 } 1572 1573 static void amd_iommu_flush_all(struct amd_iommu *iommu) 1574 { 1575 struct iommu_cmd cmd; 1576 1577 build_inv_all(&cmd); 1578 1579 iommu_queue_command(iommu, &cmd); 1580 iommu_completion_wait(iommu); 1581 } 1582 1583 static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid) 1584 { 1585 struct iommu_cmd cmd; 1586 1587 build_inv_irt(&cmd, devid); 1588 1589 iommu_queue_command(iommu, &cmd); 1590 } 1591 1592 static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) 1593 { 1594 u32 devid; 1595 u16 last_bdf = iommu->pci_seg->last_bdf; 1596 1597 if (iommu->irtcachedis_enabled) 1598 return; 1599 1600 for (devid = 0; devid <= last_bdf; devid++) 1601 iommu_flush_irt(iommu, devid); 1602 1603 iommu_completion_wait(iommu); 1604 } 1605 1606 void amd_iommu_flush_all_caches(struct amd_iommu *iommu) 1607 { 1608 if (check_feature(FEATURE_IA)) { 1609 amd_iommu_flush_all(iommu); 1610 } else { 1611 amd_iommu_flush_dte_all(iommu); 1612 amd_iommu_flush_irt_all(iommu); 1613 amd_iommu_flush_tlb_all(iommu); 1614 } 1615 } 1616 1617 /* 1618 * Command send function for flushing on-device TLB 1619 */ 1620 static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address, 1621 size_t size, ioasid_t pasid, bool gn) 1622 { 1623 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 1624 struct iommu_cmd cmd; 1625 int qdep = dev_data->ats_qdep; 1626 1627 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, 1628 size, pasid, gn); 1629 1630 return iommu_queue_command(iommu, &cmd); 1631 } 1632 1633 static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data) 1634 { 1635 struct amd_iommu *iommu = data; 1636 1637 return iommu_flush_dte(iommu, alias); 1638 } 1639 1640 /* 1641 * Command send function for invalidating a device table entry 1642 */ 1643 static int device_flush_dte(struct iommu_dev_data *dev_data) 1644 { 1645 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 
1646 struct pci_dev *pdev = NULL; 1647 struct amd_iommu_pci_seg *pci_seg; 1648 u16 alias; 1649 int ret; 1650 1651 if (dev_is_pci(dev_data->dev)) 1652 pdev = to_pci_dev(dev_data->dev); 1653 1654 if (pdev) 1655 ret = pci_for_each_dma_alias(pdev, 1656 device_flush_dte_alias, iommu); 1657 else 1658 ret = iommu_flush_dte(iommu, dev_data->devid); 1659 if (ret) 1660 return ret; 1661 1662 pci_seg = iommu->pci_seg; 1663 alias = pci_seg->alias_table[dev_data->devid]; 1664 if (alias != dev_data->devid) { 1665 ret = iommu_flush_dte(iommu, alias); 1666 if (ret) 1667 return ret; 1668 } 1669 1670 if (dev_data->ats_enabled) { 1671 /* Invalidate the entire contents of an IOTLB */ 1672 ret = device_flush_iotlb(dev_data, 0, ~0UL, 1673 IOMMU_NO_PASID, false); 1674 } 1675 1676 return ret; 1677 } 1678 1679 static int domain_flush_pages_v2(struct protection_domain *pdom, 1680 u64 address, size_t size) 1681 { 1682 struct iommu_dev_data *dev_data; 1683 struct iommu_cmd cmd; 1684 int ret = 0; 1685 1686 lockdep_assert_held(&pdom->lock); 1687 list_for_each_entry(dev_data, &pdom->dev_list, list) { 1688 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 1689 u16 domid = dev_data->gcr3_info.domid; 1690 1691 build_inv_iommu_pages(&cmd, address, size, 1692 domid, IOMMU_NO_PASID, true); 1693 1694 ret |= iommu_queue_command(iommu, &cmd); 1695 } 1696 1697 return ret; 1698 } 1699 1700 static int domain_flush_pages_v1(struct protection_domain *pdom, 1701 u64 address, size_t size) 1702 { 1703 struct pdom_iommu_info *pdom_iommu_info; 1704 struct iommu_cmd cmd; 1705 int ret = 0; 1706 unsigned long i; 1707 1708 lockdep_assert_held(&pdom->lock); 1709 1710 build_inv_iommu_pages(&cmd, address, size, 1711 pdom->id, IOMMU_NO_PASID, false); 1712 1713 xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { 1714 /* 1715 * Devices of this domain are behind this IOMMU 1716 * We need a TLB flush 1717 */ 1718 ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); 1719 } 1720 1721 /* 1722 * A domain w/ v1 table can be a nest parent, which can have 1723 * multiple nested domains. Each nested domain has 1:1 mapping 1724 * between gDomID and hDomID. Therefore, flush every hDomID 1725 * associated to this nest parent domain. 1726 * 1727 * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested() 1728 */ 1729 if (!list_empty(&pdom->viommu_list)) 1730 ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size); 1731 1732 return ret; 1733 } 1734 1735 /* 1736 * TLB invalidation function which is called from the mapping functions. 1737 * It flushes range of PTEs of the domain. 
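 * Both the IOMMU TLB(s) and, for ATS-enabled devices, the on-device IOTLBs
 * are invalidated; waiting for completion is left to the caller.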
1738 */ 1739 static void __domain_flush_pages(struct protection_domain *domain, 1740 u64 address, size_t size) 1741 { 1742 struct iommu_dev_data *dev_data; 1743 int ret = 0; 1744 ioasid_t pasid = IOMMU_NO_PASID; 1745 bool gn = false; 1746 1747 lockdep_assert_held(&domain->lock); 1748 1749 if (pdom_is_v2_pgtbl_mode(domain)) { 1750 gn = true; 1751 ret = domain_flush_pages_v2(domain, address, size); 1752 } else { 1753 ret = domain_flush_pages_v1(domain, address, size); 1754 } 1755 1756 list_for_each_entry(dev_data, &domain->dev_list, list) { 1757 1758 if (!dev_data->ats_enabled) 1759 continue; 1760 1761 ret |= device_flush_iotlb(dev_data, address, size, pasid, gn); 1762 } 1763 1764 WARN_ON(ret); 1765 } 1766 1767 void amd_iommu_domain_flush_pages(struct protection_domain *domain, 1768 u64 address, size_t size) 1769 { 1770 lockdep_assert_held(&domain->lock); 1771 1772 if (likely(!amd_iommu_np_cache)) { 1773 __domain_flush_pages(domain, address, size); 1774 1775 /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ 1776 domain_flush_complete(domain); 1777 1778 return; 1779 } 1780 1781 /* 1782 * When NpCache is on, we infer that we run in a VM and use a vIOMMU. 1783 * In such setups it is best to avoid flushes of ranges which are not 1784 * naturally aligned, since it would lead to flushes of unmodified 1785 * PTEs. Such flushes would require the hypervisor to do more work than 1786 * necessary. Therefore, perform repeated flushes of aligned ranges 1787 * until you cover the range. Each iteration flushes the smaller 1788 * between the natural alignment of the address that we flush and the 1789 * greatest naturally aligned region that fits in the range. 1790 */ 1791 while (size != 0) { 1792 int addr_alignment = __ffs(address); 1793 int size_alignment = __fls(size); 1794 int min_alignment; 1795 size_t flush_size; 1796 1797 /* 1798 * size is always non-zero, but address might be zero, causing 1799 * addr_alignment to be negative. As the casting of the 1800 * argument in __ffs(address) to long might trim the high bits 1801 * of the address on x86-32, cast to long when doing the check. 
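		 *
		 * Worked example: address = 0x1000, size = 0x5000 is split
		 * into three naturally aligned flushes: 0x1000 (4 KiB),
		 * 0x2000 (8 KiB) and 0x4000 (8 KiB).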
		 */
		if (likely((unsigned long)address != 0))
			min_alignment = min(addr_alignment, size_alignment);
		else
			min_alignment = size_alignment;

		flush_size = 1ul << min_alignment;

		__domain_flush_pages(domain, address, flush_size);
		address += flush_size;
		size -= flush_size;
	}

	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
	domain_flush_complete(domain);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void amd_iommu_domain_flush_all(struct protection_domain *domain)
{
	amd_iommu_domain_flush_pages(domain, 0,
				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}

void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
				     ioasid_t pasid, u64 address, size_t size)
{
	struct iommu_cmd cmd;
	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);

	build_inv_iommu_pages(&cmd, address, size,
			      dev_data->gcr3_info.domid, pasid, true);
	iommu_queue_command(iommu, &cmd);

	if (dev_data->ats_enabled)
		device_flush_iotlb(dev_data, address, size, pasid, true);

	iommu_completion_wait(iommu);
}

static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
				ioasid_t pasid)
{
	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}

int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;

	dev_data = dev_iommu_priv_get(dev);
	iommu = get_amd_iommu_from_dev(dev);

	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
			   tag, dev_data->pri_tlp);

	return iommu_queue_command(iommu, &cmd);
}

/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
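 * Protection domain IDs are allocated from the global pdom_ids IDA declared
 * near the top of this file.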
1871 * 1872 ****************************************************************************/ 1873 int amd_iommu_pdom_id_alloc(void) 1874 { 1875 return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); 1876 } 1877 1878 int amd_iommu_pdom_id_reserve(u16 id, gfp_t gfp) 1879 { 1880 return ida_alloc_range(&pdom_ids, id, id, gfp); 1881 } 1882 1883 void amd_iommu_pdom_id_free(int id) 1884 { 1885 ida_free(&pdom_ids, id); 1886 } 1887 1888 void amd_iommu_pdom_id_destroy(void) 1889 { 1890 ida_destroy(&pdom_ids); 1891 } 1892 1893 static void free_gcr3_tbl_level1(u64 *tbl) 1894 { 1895 u64 *ptr; 1896 int i; 1897 1898 for (i = 0; i < 512; ++i) { 1899 if (!(tbl[i] & GCR3_VALID)) 1900 continue; 1901 1902 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1903 1904 iommu_free_pages(ptr); 1905 } 1906 } 1907 1908 static void free_gcr3_tbl_level2(u64 *tbl) 1909 { 1910 u64 *ptr; 1911 int i; 1912 1913 for (i = 0; i < 512; ++i) { 1914 if (!(tbl[i] & GCR3_VALID)) 1915 continue; 1916 1917 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1918 1919 free_gcr3_tbl_level1(ptr); 1920 } 1921 } 1922 1923 static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) 1924 { 1925 if (gcr3_info->glx == 2) 1926 free_gcr3_tbl_level2(gcr3_info->gcr3_tbl); 1927 else if (gcr3_info->glx == 1) 1928 free_gcr3_tbl_level1(gcr3_info->gcr3_tbl); 1929 else 1930 WARN_ON_ONCE(gcr3_info->glx != 0); 1931 1932 gcr3_info->glx = 0; 1933 1934 /* Free per device domain ID */ 1935 amd_iommu_pdom_id_free(gcr3_info->domid); 1936 1937 iommu_free_pages(gcr3_info->gcr3_tbl); 1938 gcr3_info->gcr3_tbl = NULL; 1939 } 1940 1941 /* 1942 * Number of GCR3 table levels required. Level must be 4-Kbyte 1943 * page and can contain up to 512 entries. 1944 */ 1945 static int get_gcr3_levels(int pasids) 1946 { 1947 int levels; 1948 1949 if (pasids == -1) 1950 return amd_iommu_max_glx_val; 1951 1952 levels = get_count_order(pasids); 1953 1954 return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels; 1955 } 1956 1957 static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, 1958 struct amd_iommu *iommu, int pasids) 1959 { 1960 int levels = get_gcr3_levels(pasids); 1961 int nid = iommu ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 1962 int domid; 1963 1964 if (levels > amd_iommu_max_glx_val) 1965 return -EINVAL; 1966 1967 if (gcr3_info->gcr3_tbl) 1968 return -EBUSY; 1969 1970 /* Allocate per device domain ID */ 1971 domid = amd_iommu_pdom_id_alloc(); 1972 if (domid <= 0) 1973 return -ENOSPC; 1974 gcr3_info->domid = domid; 1975 1976 gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K); 1977 if (gcr3_info->gcr3_tbl == NULL) { 1978 amd_iommu_pdom_id_free(domid); 1979 return -ENOMEM; 1980 } 1981 1982 gcr3_info->glx = levels; 1983 1984 return 0; 1985 } 1986 1987 static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info, 1988 ioasid_t pasid, bool alloc) 1989 { 1990 int index; 1991 u64 *pte; 1992 u64 *root = gcr3_info->gcr3_tbl; 1993 int level = gcr3_info->glx; 1994 1995 while (true) { 1996 1997 index = (pasid >> (9 * level)) & 0x1ff; 1998 pte = &root[index]; 1999 2000 if (level == 0) 2001 break; 2002 2003 if (!(*pte & GCR3_VALID)) { 2004 if (!alloc) 2005 return NULL; 2006 2007 root = (void *)get_zeroed_page(GFP_ATOMIC); 2008 if (root == NULL) 2009 return NULL; 2010 2011 *pte = iommu_virt_to_phys(root) | GCR3_VALID; 2012 } 2013 2014 root = iommu_phys_to_virt(*pte & PAGE_MASK); 2015 2016 level -= 1; 2017 } 2018 2019 return pte; 2020 } 2021 2022 static int update_gcr3(struct iommu_dev_data *dev_data, 2023 ioasid_t pasid, unsigned long gcr3, bool set) 2024 { 2025 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2026 u64 *pte; 2027 2028 pte = __get_gcr3_pte(gcr3_info, pasid, true); 2029 if (pte == NULL) 2030 return -ENOMEM; 2031 2032 if (set) 2033 *pte = (gcr3 & PAGE_MASK) | GCR3_VALID; 2034 else 2035 *pte = 0; 2036 2037 dev_flush_pasid_all(dev_data, pasid); 2038 return 0; 2039 } 2040 2041 int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid, 2042 unsigned long gcr3) 2043 { 2044 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2045 int ret; 2046 2047 iommu_group_mutex_assert(dev_data->dev); 2048 2049 ret = update_gcr3(dev_data, pasid, gcr3, true); 2050 if (ret) 2051 return ret; 2052 2053 gcr3_info->pasid_cnt++; 2054 return ret; 2055 } 2056 2057 int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) 2058 { 2059 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2060 int ret; 2061 2062 iommu_group_mutex_assert(dev_data->dev); 2063 2064 ret = update_gcr3(dev_data, pasid, 0, false); 2065 if (ret) 2066 return ret; 2067 2068 gcr3_info->pasid_cnt--; 2069 return ret; 2070 } 2071 2072 /* 2073 * Note: 2074 * The old value for GCR3 table and GPT have been cleared from caller. 2075 */ 2076 static void set_dte_gcr3_table(struct iommu_dev_data *dev_data, 2077 struct dev_table_entry *new) 2078 { 2079 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2080 u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); 2081 2082 new->data[0] |= DTE_FLAG_TV | 2083 (dev_data->ppr ? DTE_FLAG_PPR : 0) | 2084 (pdom_is_v2_pgtbl_mode(dev_data->domain) ? DTE_FLAG_GIOV : 0) | 2085 DTE_FLAG_GV | 2086 FIELD_PREP(DTE_GLX, gcr3_info->glx) | 2087 FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) | 2088 DTE_FLAG_IR | DTE_FLAG_IW; 2089 2090 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) | 2091 FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | 2092 (dev_data->ats_enabled ? 
DTE_FLAG_IOTLB : 0) | 2093 FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); 2094 2095 /* Guest page table can only support 4 and 5 levels */ 2096 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) 2097 new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); 2098 else 2099 new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); 2100 } 2101 2102 void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, 2103 struct protection_domain *domain, u16 domid, 2104 struct pt_iommu_amdv1_hw_info *pt_info, 2105 struct dev_table_entry *new) 2106 { 2107 u64 host_pt_root = __sme_set(pt_info->host_pt_root); 2108 2109 /* Note Dirty tracking is used for v1 table only for now */ 2110 new->data[0] |= DTE_FLAG_TV | 2111 FIELD_PREP(DTE_MODE_MASK, pt_info->mode) | 2112 (domain->dirty_tracking ? DTE_FLAG_HAD : 0) | 2113 FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) | 2114 DTE_FLAG_IR | DTE_FLAG_IW; 2115 2116 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) | 2117 (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0); 2118 } 2119 2120 static void set_dte_v1(struct iommu_dev_data *dev_data, 2121 struct protection_domain *domain, u16 domid, 2122 phys_addr_t top_paddr, unsigned int top_level, 2123 struct dev_table_entry *new) 2124 { 2125 struct pt_iommu_amdv1_hw_info pt_info; 2126 2127 /* 2128 * When updating the IO pagetable, the new top and level 2129 * are provided as parameters. For other operations i.e. 2130 * device attach, retrieve the current pagetable info 2131 * via the IOMMU PT API. 2132 */ 2133 if (top_paddr) { 2134 pt_info.host_pt_root = top_paddr; 2135 pt_info.mode = top_level + 1; 2136 } else { 2137 WARN_ON(top_paddr || top_level); 2138 pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info); 2139 } 2140 2141 amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new); 2142 } 2143 2144 static void set_dte_passthrough(struct iommu_dev_data *dev_data, 2145 struct protection_domain *domain, 2146 struct dev_table_entry *new) 2147 { 2148 new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW; 2149 2150 new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) | 2151 (dev_data->ats_enabled) ? 
DTE_FLAG_IOTLB : 0; 2152 } 2153 2154 static void set_dte_entry(struct amd_iommu *iommu, 2155 struct iommu_dev_data *dev_data, 2156 phys_addr_t top_paddr, unsigned int top_level) 2157 { 2158 u32 old_domid; 2159 struct dev_table_entry new = {}; 2160 struct protection_domain *domain = dev_data->domain; 2161 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2162 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2163 2164 amd_iommu_make_clear_dte(dev_data, &new); 2165 2166 old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; 2167 if (gcr3_info->gcr3_tbl) 2168 set_dte_gcr3_table(dev_data, &new); 2169 else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) 2170 set_dte_passthrough(dev_data, domain, &new); 2171 else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) && 2172 domain->pd_mode == PD_MODE_V1) 2173 set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new); 2174 else 2175 WARN_ON(true); 2176 2177 amd_iommu_update_dte(iommu, dev_data, &new); 2178 2179 /* 2180 * A kdump kernel might be replacing a domain ID that was copied from 2181 * the previous kernel--if so, it needs to flush the translation cache 2182 * entries for the old domain ID that is being overwritten. 2183 */ 2184 if (old_domid) { 2185 amd_iommu_flush_tlb_domid(iommu, old_domid); 2186 } 2187 } 2188 2189 /* 2190 * Clear DMA-remap related flags to block all DMA (blocked domain) 2191 */ 2192 static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) 2193 { 2194 struct dev_table_entry new = {}; 2195 2196 amd_iommu_make_clear_dte(dev_data, &new); 2197 amd_iommu_update_dte(iommu, dev_data, &new); 2198 } 2199 2200 /* Update and flush DTE for the given device */ 2201 static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) 2202 { 2203 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 2204 2205 if (set) 2206 set_dte_entry(iommu, dev_data, 0, 0); 2207 else 2208 clear_dte_entry(iommu, dev_data); 2209 } 2210 2211 /* 2212 * If domain is SVA capable then initialize GCR3 table. Also if domain is 2213 * in v2 page table mode then update GCR3[0]. 2214 */ 2215 static int init_gcr3_table(struct iommu_dev_data *dev_data, 2216 struct protection_domain *pdom) 2217 { 2218 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2219 int max_pasids = dev_data->max_pasids; 2220 struct pt_iommu_x86_64_hw_info pt_info; 2221 int ret = 0; 2222 2223 /* 2224 * If domain is in pt mode then setup GCR3 table only if device 2225 * is PASID capable. 2226 */ 2227 if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data)) 2228 return ret; 2229 2230 /* 2231 * By default, setup GCR3 table to support MAX PASIDs 2232 * supported by the device/IOMMU. 2233 */ 2234 ret = setup_gcr3_table(&dev_data->gcr3_info, iommu, 2235 max_pasids > 0 ?
max_pasids : 1); 2236 if (ret) 2237 return ret; 2238 2239 /* Setup GCR3[0] only if domain is setup with v2 page table mode */ 2240 if (!pdom_is_v2_pgtbl_mode(pdom)) 2241 return ret; 2242 2243 pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); 2244 ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); 2245 if (ret) 2246 free_gcr3_table(&dev_data->gcr3_info); 2247 2248 return ret; 2249 } 2250 2251 static void destroy_gcr3_table(struct iommu_dev_data *dev_data, 2252 struct protection_domain *pdom) 2253 { 2254 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2255 2256 if (pdom_is_v2_pgtbl_mode(pdom)) 2257 update_gcr3(dev_data, 0, 0, false); 2258 2259 if (gcr3_info->gcr3_tbl == NULL) 2260 return; 2261 2262 free_gcr3_table(gcr3_info); 2263 } 2264 2265 static int pdom_attach_iommu(struct amd_iommu *iommu, 2266 struct protection_domain *pdom) 2267 { 2268 struct pdom_iommu_info *pdom_iommu_info, *curr; 2269 unsigned long flags; 2270 int ret = 0; 2271 2272 spin_lock_irqsave(&pdom->lock, flags); 2273 2274 pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); 2275 if (pdom_iommu_info) { 2276 pdom_iommu_info->refcnt++; 2277 goto out_unlock; 2278 } 2279 2280 pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); 2281 if (!pdom_iommu_info) { 2282 ret = -ENOMEM; 2283 goto out_unlock; 2284 } 2285 2286 pdom_iommu_info->iommu = iommu; 2287 pdom_iommu_info->refcnt = 1; 2288 2289 curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, 2290 NULL, pdom_iommu_info, GFP_ATOMIC); 2291 if (curr) { 2292 kfree(pdom_iommu_info); 2293 ret = -ENOSPC; 2294 goto out_unlock; 2295 } 2296 2297 out_unlock: 2298 spin_unlock_irqrestore(&pdom->lock, flags); 2299 return ret; 2300 } 2301 2302 static void pdom_detach_iommu(struct amd_iommu *iommu, 2303 struct protection_domain *pdom) 2304 { 2305 struct pdom_iommu_info *pdom_iommu_info; 2306 unsigned long flags; 2307 2308 spin_lock_irqsave(&pdom->lock, flags); 2309 2310 pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); 2311 if (!pdom_iommu_info) { 2312 spin_unlock_irqrestore(&pdom->lock, flags); 2313 return; 2314 } 2315 2316 pdom_iommu_info->refcnt--; 2317 if (pdom_iommu_info->refcnt == 0) { 2318 xa_erase(&pdom->iommu_array, iommu->index); 2319 kfree(pdom_iommu_info); 2320 } 2321 2322 spin_unlock_irqrestore(&pdom->lock, flags); 2323 } 2324 2325 /* 2326 * If a device is not yet associated with a domain, this function makes the 2327 * device visible in the domain 2328 */ 2329 static int attach_device(struct device *dev, 2330 struct protection_domain *domain) 2331 { 2332 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2333 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2334 struct pci_dev *pdev; 2335 unsigned long flags; 2336 int ret = 0; 2337 2338 mutex_lock(&dev_data->mutex); 2339 2340 if (dev_data->domain != NULL) { 2341 ret = -EBUSY; 2342 goto out; 2343 } 2344 2345 /* Do reference counting */ 2346 ret = pdom_attach_iommu(iommu, domain); 2347 if (ret) 2348 goto out; 2349 2350 /* Setup GCR3 table */ 2351 if (pdom_is_sva_capable(domain)) { 2352 ret = init_gcr3_table(dev_data, domain); 2353 if (ret) { 2354 pdom_detach_iommu(iommu, domain); 2355 goto out; 2356 } 2357 } 2358 2359 pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; 2360 if (pdev && pdom_is_sva_capable(domain)) { 2361 pdev_enable_caps(pdev); 2362 2363 /* 2364 * Device can continue to function even if IOPF 2365 * enablement failed. Hence in error path just 2366 * disable device PRI support. 
2367 */ 2368 if (amd_iommu_iopf_add_device(iommu, dev_data)) 2369 pdev_disable_cap_pri(pdev); 2370 } else if (pdev) { 2371 pdev_enable_cap_ats(pdev); 2372 } 2373 2374 /* Update data structures */ 2375 dev_data->domain = domain; 2376 spin_lock_irqsave(&domain->lock, flags); 2377 list_add(&dev_data->list, &domain->dev_list); 2378 spin_unlock_irqrestore(&domain->lock, flags); 2379 2380 /* Update device table */ 2381 dev_update_dte(dev_data, true); 2382 2383 out: 2384 mutex_unlock(&dev_data->mutex); 2385 2386 return ret; 2387 } 2388 2389 /* 2390 * Removes a device from a protection domain (with devtable_lock held) 2391 */ 2392 static void detach_device(struct device *dev) 2393 { 2394 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2395 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2396 struct protection_domain *domain = dev_data->domain; 2397 unsigned long flags; 2398 2399 mutex_lock(&dev_data->mutex); 2400 2401 /* 2402 * First check if the device is still attached. It might already 2403 * be detached from its domain because the generic 2404 * iommu_detach_group code detached it and we try again here in 2405 * our alias handling. 2406 */ 2407 if (WARN_ON(!dev_data->domain)) 2408 goto out; 2409 2410 /* Remove IOPF handler */ 2411 if (dev_data->ppr) { 2412 iopf_queue_flush_dev(dev); 2413 amd_iommu_iopf_remove_device(iommu, dev_data); 2414 } 2415 2416 if (dev_is_pci(dev)) 2417 pdev_disable_caps(to_pci_dev(dev)); 2418 2419 /* Clear DTE and flush the entry */ 2420 dev_update_dte(dev_data, false); 2421 2422 /* Flush IOTLB and wait for the flushes to finish */ 2423 spin_lock_irqsave(&domain->lock, flags); 2424 amd_iommu_domain_flush_all(domain); 2425 list_del(&dev_data->list); 2426 spin_unlock_irqrestore(&domain->lock, flags); 2427 2428 /* Clear GCR3 table */ 2429 if (pdom_is_sva_capable(domain)) 2430 destroy_gcr3_table(dev_data, domain); 2431 2432 /* Update data structures */ 2433 dev_data->domain = NULL; 2434 2435 /* decrease reference counters - needs to happen after the flushes */ 2436 pdom_detach_iommu(iommu, domain); 2437 2438 out: 2439 mutex_unlock(&dev_data->mutex); 2440 } 2441 2442 static struct iommu_device *amd_iommu_probe_device(struct device *dev) 2443 { 2444 struct iommu_device *iommu_dev; 2445 struct amd_iommu *iommu; 2446 struct iommu_dev_data *dev_data; 2447 int ret; 2448 2449 if (!check_device(dev)) 2450 return ERR_PTR(-ENODEV); 2451 2452 iommu = rlookup_amd_iommu(dev); 2453 if (!iommu) 2454 return ERR_PTR(-ENODEV); 2455 2456 /* Not registered yet? */ 2457 if (!iommu->iommu.ops) 2458 return ERR_PTR(-ENODEV); 2459 2460 if (dev_iommu_priv_get(dev)) 2461 return &iommu->iommu; 2462 2463 ret = iommu_init_device(iommu, dev); 2464 if (ret) { 2465 dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); 2466 iommu_dev = ERR_PTR(ret); 2467 iommu_ignore_device(iommu, dev); 2468 goto out_err; 2469 } 2470 2471 amd_iommu_set_pci_msi_domain(dev, iommu); 2472 iommu_dev = &iommu->iommu; 2473 2474 /* 2475 * If IOMMU and device supports PASID then it will contain max 2476 * supported PASIDs, else it will be zero. 
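 * (dev_data->max_pasids is later used to size the per-device GCR3 table,
 * see init_gcr3_table() and setup_gcr3_table() above.)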
2477 */ 2478 dev_data = dev_iommu_priv_get(dev); 2479 if (amd_iommu_pasid_supported() && dev_is_pci(dev) && 2480 pdev_pasid_supported(dev_data)) { 2481 dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids, 2482 pci_max_pasids(to_pci_dev(dev))); 2483 } 2484 2485 if (amd_iommu_pgtable == PD_MODE_NONE) { 2486 pr_warn_once("%s: DMA translation not supported by iommu.\n", 2487 __func__); 2488 iommu_dev = ERR_PTR(-ENODEV); 2489 goto out_err; 2490 } 2491 2492 iommu_completion_wait(iommu); 2493 2494 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 2495 dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; 2496 else 2497 dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; 2498 2499 if (dev_is_pci(dev)) 2500 pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); 2501 2502 out_err: 2503 return iommu_dev; 2504 } 2505 2506 static void amd_iommu_release_device(struct device *dev) 2507 { 2508 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2509 2510 WARN_ON(dev_data->domain); 2511 2512 /* 2513 * We keep dev_data around for unplugged devices and reuse it when the 2514 * device is re-plugged - not doing so would introduce a ton of races. 2515 */ 2516 } 2517 2518 static struct iommu_group *amd_iommu_device_group(struct device *dev) 2519 { 2520 if (dev_is_pci(dev)) 2521 return pci_device_group(dev); 2522 2523 return acpihid_device_group(dev); 2524 } 2525 2526 /***************************************************************************** 2527 * 2528 * The following functions belong to the exported interface of AMD IOMMU 2529 * 2530 * This interface allows access to lower level functions of the IOMMU 2531 * like protection domain handling and assignment of devices to domains, 2532 * which is not possible with the dma_ops interface. 2533 * 2534 *****************************************************************************/ 2535 2536 static void protection_domain_init(struct protection_domain *domain) 2537 { 2538 spin_lock_init(&domain->lock); 2539 INIT_LIST_HEAD(&domain->dev_list); 2540 INIT_LIST_HEAD(&domain->dev_data_list); 2541 INIT_LIST_HEAD(&domain->viommu_list); 2542 xa_init(&domain->iommu_array); 2543 } 2544 2545 struct protection_domain *protection_domain_alloc(void) 2546 { 2547 struct protection_domain *domain; 2548 int domid; 2549 2550 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2551 if (!domain) 2552 return NULL; 2553 2554 domid = amd_iommu_pdom_id_alloc(); 2555 if (domid <= 0) { 2556 kfree(domain); 2557 return NULL; 2558 } 2559 domain->id = domid; 2560 2561 protection_domain_init(domain); 2562 2563 return domain; 2564 } 2565 2566 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2567 { 2568 if (amd_iommu_hatdis) 2569 return false; 2570 2571 return iommu && (iommu->features & FEATURE_HDSUP); 2572 } 2573 2574 static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) 2575 { 2576 struct protection_domain *pdom = 2577 container_of(iommupt, struct protection_domain, iommu); 2578 2579 return &pdom->lock; 2580 } 2581 2582 /* 2583 * Update all HW references to the domain with a new pgtable configuration.
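 * Reached through the driver_ops->change_top callback registered in
 * amd_hw_driver_ops_v1 below, e.g. when PT_FEAT_DYNAMIC_TOP grows the table
 * to a new top level; the pdom->lock asserted below is the top lock
 * returned by amd_iommu_get_top_lock().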
2584 */ 2585 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 2586 phys_addr_t top_paddr, unsigned int top_level) 2587 { 2588 struct protection_domain *pdom = 2589 container_of(iommu_table, struct protection_domain, iommu); 2590 struct iommu_dev_data *dev_data; 2591 2592 lockdep_assert_held(&pdom->lock); 2593 2594 /* Update the DTE for all devices attached to this domain */ 2595 list_for_each_entry(dev_data, &pdom->dev_list, list) { 2596 struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 2597 2598 /* Update the HW references with the new level and top ptr */ 2599 set_dte_entry(iommu, dev_data, top_paddr, top_level); 2600 clone_aliases(iommu, dev_data->dev); 2601 } 2602 2603 list_for_each_entry(dev_data, &pdom->dev_list, list) 2604 device_flush_dte(dev_data); 2605 2606 domain_flush_complete(pdom); 2607 } 2608 2609 /* 2610 * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to 2611 * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non 2612 * present caching (like hypervisor shadowing). 2613 */ 2614 static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2615 unsigned long iova, size_t size) 2616 { 2617 struct protection_domain *domain = to_pdomain(dom); 2618 unsigned long flags; 2619 2620 if (likely(!amd_iommu_np_cache)) 2621 return 0; 2622 2623 spin_lock_irqsave(&domain->lock, flags); 2624 amd_iommu_domain_flush_pages(domain, iova, size); 2625 spin_unlock_irqrestore(&domain->lock, flags); 2626 return 0; 2627 } 2628 2629 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2630 { 2631 struct protection_domain *dom = to_pdomain(domain); 2632 unsigned long flags; 2633 2634 spin_lock_irqsave(&dom->lock, flags); 2635 amd_iommu_domain_flush_all(dom); 2636 spin_unlock_irqrestore(&dom->lock, flags); 2637 } 2638 2639 static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2640 struct iommu_iotlb_gather *gather) 2641 { 2642 struct protection_domain *dom = to_pdomain(domain); 2643 unsigned long flags; 2644 2645 spin_lock_irqsave(&dom->lock, flags); 2646 amd_iommu_domain_flush_pages(dom, gather->start, 2647 gather->end - gather->start + 1); 2648 spin_unlock_irqrestore(&dom->lock, flags); 2649 iommu_put_pages_list(&gather->freelist); 2650 } 2651 2652 static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { 2653 .get_top_lock = amd_iommu_get_top_lock, 2654 .change_top = amd_iommu_change_top, 2655 }; 2656 2657 static const struct iommu_domain_ops amdv1_ops = { 2658 IOMMU_PT_DOMAIN_OPS(amdv1), 2659 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2660 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2661 .iotlb_sync = amd_iommu_iotlb_sync, 2662 .attach_dev = amd_iommu_attach_device, 2663 .free = amd_iommu_domain_free, 2664 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2665 }; 2666 2667 static const struct iommu_dirty_ops amdv1_dirty_ops = { 2668 IOMMU_PT_DIRTY_OPS(amdv1), 2669 .set_dirty_tracking = amd_iommu_set_dirty_tracking, 2670 }; 2671 2672 static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, 2673 u32 flags) 2674 { 2675 struct pt_iommu_amdv1_cfg cfg = {}; 2676 struct protection_domain *domain; 2677 int ret; 2678 2679 if (amd_iommu_hatdis) 2680 return ERR_PTR(-EOPNOTSUPP); 2681 2682 domain = protection_domain_alloc(); 2683 if (!domain) 2684 return ERR_PTR(-ENOMEM); 2685 2686 domain->pd_mode = PD_MODE_V1; 2687 domain->iommu.driver_ops = &amd_hw_driver_ops_v1; 2688 domain->iommu.nid = dev_to_node(dev); 2689 if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 2690 
domain->domain.dirty_ops = &amdv1_dirty_ops; 2691 2692 /* 2693 * Someday FORCE_COHERENCE should be set by 2694 * amd_iommu_enforce_cache_coherency() like VT-d does. 2695 */ 2696 cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 2697 BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 2698 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 2699 2700 /* 2701 * AMD's IOMMU can flush as many pages as necessary in a single flush. 2702 * Unless we run in a virtual machine, which can be inferred according 2703 * to whether "non-present cache" is on, it is probably best to prefer 2704 * (potentially) too extensive TLB flushing (i.e., more misses) over 2705 * multiple TLB flushes (i.e., more flushes). For virtual machines the 2706 * hypervisor needs to synchronize the host IOMMU PTEs with those of 2707 * the guest, and the trade-off is different: unnecessary TLB flushes 2708 * should be avoided. 2709 */ 2710 if (amd_iommu_np_cache) 2711 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2712 else 2713 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2714 2715 cfg.common.hw_max_vasz_lg2 = 2716 min(64, (amd_iommu_hpt_level - 1) * 9 + 21); 2717 cfg.common.hw_max_oasz_lg2 = 52; 2718 cfg.starting_level = 2; 2719 domain->domain.ops = &amdv1_ops; 2720 2721 ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); 2722 if (ret) { 2723 amd_iommu_domain_free(&domain->domain); 2724 return ERR_PTR(ret); 2725 } 2726 2727 /* 2728 * Narrow the supported page sizes to those selected by the kernel 2729 * command line. 2730 */ 2731 domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; 2732 return &domain->domain; 2733 } 2734 2735 static const struct iommu_domain_ops amdv2_ops = { 2736 IOMMU_PT_DOMAIN_OPS(x86_64), 2737 .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2738 .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2739 .iotlb_sync = amd_iommu_iotlb_sync, 2740 .attach_dev = amd_iommu_attach_device, 2741 .free = amd_iommu_domain_free, 2742 /* 2743 * Note the AMDv2 page table format does not support a Force Coherency 2744 * bit, so enforce_cache_coherency should not be set. However, VFIO is 2745 * not prepared to handle a case where some domains will support 2746 * enforcement and others do not. VFIO and iommufd will have to be fixed 2747 * before they can fully use the V2 page table. See the comment in 2748 * iommufd_hwpt_paging_alloc(). For now leave things as they have 2749 * historically been and lie about enforce_cache_coherency. 2750 */ 2751 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2752 }; 2753 2754 static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, 2755 u32 flags) 2756 { 2757 struct pt_iommu_x86_64_cfg cfg = {}; 2758 struct protection_domain *domain; 2759 int ret; 2760 2761 if (!amd_iommu_v2_pgtbl_supported()) 2762 return ERR_PTR(-EOPNOTSUPP); 2763 2764 domain = protection_domain_alloc(); 2765 if (!domain) 2766 return ERR_PTR(-ENOMEM); 2767 2768 domain->pd_mode = PD_MODE_V2; 2769 domain->iommu.nid = dev_to_node(dev); 2770 2771 cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); 2772 if (amd_iommu_np_cache) 2773 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2774 else 2775 cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2776 2777 /* 2778 * The v2 table behaves differently if it is attached to PASID 0 vs a 2779 * non-zero PASID. On PASID 0 it has no sign extension and the full 2780 * 57/48 bits decode the lower addresses. Otherwise it behaves like a 2781 * normal sign extended x86 page table.
Since we want the domain to work 2782 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not 2783 * set which creates a table that is compatible in both modes. 2784 */ 2785 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { 2786 cfg.common.hw_max_vasz_lg2 = 56; 2787 cfg.top_level = 4; 2788 } else { 2789 cfg.common.hw_max_vasz_lg2 = 47; 2790 cfg.top_level = 3; 2791 } 2792 cfg.common.hw_max_oasz_lg2 = 52; 2793 domain->domain.ops = &amdv2_ops; 2794 2795 ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); 2796 if (ret) { 2797 amd_iommu_domain_free(&domain->domain); 2798 return ERR_PTR(ret); 2799 } 2800 return &domain->domain; 2801 } 2802 2803 static inline bool is_nest_parent_supported(u32 flags) 2804 { 2805 /* Only allow nest parent when these features are supported */ 2806 return check_feature(FEATURE_GT) && 2807 check_feature(FEATURE_GIOSUP) && 2808 check_feature2(FEATURE_GCR3TRPMODE); 2809 } 2810 2811 static struct iommu_domain * 2812 amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 2813 const struct iommu_user_data *user_data) 2814 2815 { 2816 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2817 const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 2818 IOMMU_HWPT_ALLOC_PASID | 2819 IOMMU_HWPT_ALLOC_NEST_PARENT; 2820 2821 if ((flags & ~supported_flags) || user_data) 2822 return ERR_PTR(-EOPNOTSUPP); 2823 2824 switch (flags & supported_flags) { 2825 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: 2826 case IOMMU_HWPT_ALLOC_NEST_PARENT: 2827 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: 2828 /* 2829 * Allocate domain with v1 page table for dirty tracking 2830 * and/or Nest parent. 2831 */ 2832 if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && 2833 !amd_iommu_hd_support(iommu)) 2834 break; 2835 2836 if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && 2837 !is_nest_parent_supported(flags)) 2838 break; 2839 2840 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2841 case IOMMU_HWPT_ALLOC_PASID: 2842 /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ 2843 if (!amd_iommu_pasid_supported()) 2844 break; 2845 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2846 case 0: { 2847 struct iommu_domain *ret; 2848 2849 /* If nothing specific is required use the kernel commandline default */ 2850 if (amd_iommu_pgtable == PD_MODE_V1) { 2851 ret = amd_iommu_domain_alloc_paging_v1(dev, flags); 2852 if (ret != ERR_PTR(-EOPNOTSUPP)) 2853 return ret; 2854 return amd_iommu_domain_alloc_paging_v2(dev, flags); 2855 } 2856 ret = amd_iommu_domain_alloc_paging_v2(dev, flags); 2857 if (ret != ERR_PTR(-EOPNOTSUPP)) 2858 return ret; 2859 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2860 } 2861 default: 2862 break; 2863 } 2864 return ERR_PTR(-EOPNOTSUPP); 2865 } 2866 2867 void amd_iommu_domain_free(struct iommu_domain *dom) 2868 { 2869 struct protection_domain *domain = to_pdomain(dom); 2870 2871 WARN_ON(!list_empty(&domain->dev_list)); 2872 pt_iommu_deinit(&domain->iommu); 2873 amd_iommu_pdom_id_free(domain->id); 2874 kfree(domain); 2875 } 2876 2877 static int blocked_domain_attach_device(struct iommu_domain *domain, 2878 struct device *dev, 2879 struct iommu_domain *old) 2880 { 2881 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2882 2883 if (dev_data->domain) 2884 detach_device(dev); 2885 2886 /* Clear DTE and flush the entry */ 2887 mutex_lock(&dev_data->mutex); 2888 dev_update_dte(dev_data, false); 2889 mutex_unlock(&dev_data->mutex); 2890 2891 return 0; 2892 } 2893 2894 static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, 2895 struct device *dev, ioasid_t pasid, 2896 struct iommu_domain *old) 2897 { 2898 amd_iommu_remove_dev_pasid(dev, pasid, old); 2899 return 0; 2900 } 2901 2902 static struct iommu_domain blocked_domain = { 2903 .type = IOMMU_DOMAIN_BLOCKED, 2904 .ops = &(const struct iommu_domain_ops) { 2905 .attach_dev = blocked_domain_attach_device, 2906 .set_dev_pasid = blocked_domain_set_dev_pasid, 2907 } 2908 }; 2909 2910 static struct protection_domain identity_domain; 2911 2912 static const struct iommu_domain_ops identity_domain_ops = { 2913 .attach_dev = amd_iommu_attach_device, 2914 }; 2915 2916 void amd_iommu_init_identity_domain(void) 2917 { 2918 struct iommu_domain *domain = &identity_domain.domain; 2919 2920 domain->type = IOMMU_DOMAIN_IDENTITY; 2921 domain->ops = &identity_domain_ops; 2922 domain->owner = &amd_iommu_ops; 2923 2924 identity_domain.id = amd_iommu_pdom_id_alloc(); 2925 2926 protection_domain_init(&identity_domain); 2927 } 2928 2929 static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 2930 struct iommu_domain *old) 2931 { 2932 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2933 struct protection_domain *domain = to_pdomain(dom); 2934 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2935 int ret; 2936 2937 /* 2938 * Skip attach device to domain if new domain is same as 2939 * devices current domain 2940 */ 2941 if (dev_data->domain == domain) 2942 return 0; 2943 2944 dev_data->defer_attach = false; 2945 2946 /* 2947 * Restrict to devices with compatible IOMMU hardware support 2948 * when enforcement of dirty tracking is enabled. 
2949 */ 2950 if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) 2951 return -EINVAL; 2952 2953 if (dev_data->domain) 2954 detach_device(dev); 2955 2956 ret = attach_device(dev, domain); 2957 2958 #ifdef CONFIG_IRQ_REMAP 2959 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { 2960 if (dom->type == IOMMU_DOMAIN_UNMANAGED) 2961 dev_data->use_vapic = 1; 2962 else 2963 dev_data->use_vapic = 0; 2964 } 2965 #endif 2966 2967 return ret; 2968 } 2969 2970 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) 2971 { 2972 switch (cap) { 2973 case IOMMU_CAP_CACHE_COHERENCY: 2974 return true; 2975 case IOMMU_CAP_NOEXEC: 2976 return false; 2977 case IOMMU_CAP_PRE_BOOT_PROTECTION: 2978 return amdr_ivrs_remap_support; 2979 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 2980 return true; 2981 case IOMMU_CAP_DEFERRED_FLUSH: 2982 return true; 2983 case IOMMU_CAP_DIRTY_TRACKING: { 2984 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2985 2986 return amd_iommu_hd_support(iommu); 2987 } 2988 default: 2989 break; 2990 } 2991 2992 return false; 2993 } 2994 2995 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 2996 bool enable) 2997 { 2998 struct protection_domain *pdomain = to_pdomain(domain); 2999 struct dev_table_entry *dte; 3000 struct iommu_dev_data *dev_data; 3001 bool domain_flush = false; 3002 struct amd_iommu *iommu; 3003 unsigned long flags; 3004 u64 new; 3005 3006 spin_lock_irqsave(&pdomain->lock, flags); 3007 if (!(pdomain->dirty_tracking ^ enable)) { 3008 spin_unlock_irqrestore(&pdomain->lock, flags); 3009 return 0; 3010 } 3011 3012 list_for_each_entry(dev_data, &pdomain->dev_list, list) { 3013 spin_lock(&dev_data->dte_lock); 3014 iommu = get_amd_iommu_from_dev_data(dev_data); 3015 dte = &get_dev_table(iommu)[dev_data->devid]; 3016 new = dte->data[0]; 3017 new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); 3018 dte->data[0] = new; 3019 spin_unlock(&dev_data->dte_lock); 3020 3021 /* Flush device DTE */ 3022 device_flush_dte(dev_data); 3023 domain_flush = true; 3024 } 3025 3026 /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ 3027 if (domain_flush) 3028 amd_iommu_domain_flush_all(pdomain); 3029 3030 pdomain->dirty_tracking = enable; 3031 spin_unlock_irqrestore(&pdomain->lock, flags); 3032 3033 return 0; 3034 } 3035 3036 static void amd_iommu_get_resv_regions(struct device *dev, 3037 struct list_head *head) 3038 { 3039 struct iommu_resv_region *region; 3040 struct unity_map_entry *entry; 3041 struct amd_iommu *iommu; 3042 struct amd_iommu_pci_seg *pci_seg; 3043 int devid, sbdf; 3044 3045 sbdf = get_device_sbdf_id(dev); 3046 if (sbdf < 0) 3047 return; 3048 3049 devid = PCI_SBDF_TO_DEVID(sbdf); 3050 iommu = get_amd_iommu_from_dev(dev); 3051 pci_seg = iommu->pci_seg; 3052 3053 list_for_each_entry(entry, &pci_seg->unity_map, list) { 3054 int type, prot = 0; 3055 size_t length; 3056 3057 if (devid < entry->devid_start || devid > entry->devid_end) 3058 continue; 3059 3060 type = IOMMU_RESV_DIRECT; 3061 length = entry->address_end - entry->address_start; 3062 if (entry->prot & IOMMU_PROT_IR) 3063 prot |= IOMMU_READ; 3064 if (entry->prot & IOMMU_PROT_IW) 3065 prot |= IOMMU_WRITE; 3066 if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE) 3067 /* Exclusion range */ 3068 type = IOMMU_RESV_RESERVED; 3069 3070 region = iommu_alloc_resv_region(entry->address_start, 3071 length, prot, type, 3072 GFP_KERNEL); 3073 if (!region) { 3074 dev_err(dev, "Out of memory allocating dm-regions\n"); 3075 return; 3076 } 3077 list_add_tail(®ion->list, head); 3078 } 3079 3080 region = iommu_alloc_resv_region(MSI_RANGE_START, 3081 MSI_RANGE_END - MSI_RANGE_START + 1, 3082 0, IOMMU_RESV_MSI, GFP_KERNEL); 3083 if (!region) 3084 return; 3085 list_add_tail(®ion->list, head); 3086 3087 if (amd_iommu_ht_range_ignore()) 3088 return; 3089 3090 region = iommu_alloc_resv_region(HT_RANGE_START, 3091 HT_RANGE_END - HT_RANGE_START + 1, 3092 0, IOMMU_RESV_RESERVED, GFP_KERNEL); 3093 if (!region) 3094 return; 3095 list_add_tail(®ion->list, head); 3096 } 3097 3098 static bool amd_iommu_is_attach_deferred(struct device *dev) 3099 { 3100 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 3101 3102 return dev_data->defer_attach; 3103 } 3104 3105 static int amd_iommu_def_domain_type(struct device *dev) 3106 { 3107 struct iommu_dev_data *dev_data; 3108 3109 dev_data = dev_iommu_priv_get(dev); 3110 if (!dev_data) 3111 return 0; 3112 3113 /* Always use DMA domain for untrusted device */ 3114 if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted) 3115 return IOMMU_DOMAIN_DMA; 3116 3117 /* 3118 * Do not identity map IOMMUv2 capable devices when: 3119 * - memory encryption is active, because some of those devices 3120 * (AMD GPUs) don't have the encryption bit in their DMA-mask 3121 * and require remapping. 3122 * - SNP is enabled, because it prohibits DTE[Mode]=0. 
3123 */ 3124 if (pdev_pasid_supported(dev_data) && 3125 !cc_platform_has(CC_ATTR_MEM_ENCRYPT) && 3126 !amd_iommu_snp_en) { 3127 return IOMMU_DOMAIN_IDENTITY; 3128 } 3129 3130 return 0; 3131 } 3132 3133 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) 3134 { 3135 /* IOMMU_PTE_FC is always set */ 3136 return true; 3137 } 3138 3139 const struct iommu_ops amd_iommu_ops = { 3140 .capable = amd_iommu_capable, 3141 .hw_info = amd_iommufd_hw_info, 3142 .blocked_domain = &blocked_domain, 3143 .release_domain = &blocked_domain, 3144 .identity_domain = &identity_domain.domain, 3145 .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, 3146 .domain_alloc_sva = amd_iommu_domain_alloc_sva, 3147 .probe_device = amd_iommu_probe_device, 3148 .release_device = amd_iommu_release_device, 3149 .device_group = amd_iommu_device_group, 3150 .get_resv_regions = amd_iommu_get_resv_regions, 3151 .is_attach_deferred = amd_iommu_is_attach_deferred, 3152 .def_domain_type = amd_iommu_def_domain_type, 3153 .page_response = amd_iommu_page_response, 3154 .get_viommu_size = amd_iommufd_get_viommu_size, 3155 .viommu_init = amd_iommufd_viommu_init, 3156 }; 3157 3158 #ifdef CONFIG_IRQ_REMAP 3159 3160 /***************************************************************************** 3161 * 3162 * Interrupt Remapping Implementation 3163 * 3164 *****************************************************************************/ 3165 3166 static struct irq_chip amd_ir_chip; 3167 static DEFINE_SPINLOCK(iommu_table_lock); 3168 3169 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) 3170 { 3171 int ret; 3172 u64 data; 3173 unsigned long flags; 3174 struct iommu_cmd cmd, cmd2; 3175 3176 if (iommu->irtcachedis_enabled) 3177 return; 3178 3179 build_inv_irt(&cmd, devid); 3180 3181 raw_spin_lock_irqsave(&iommu->lock, flags); 3182 data = get_cmdsem_val(iommu); 3183 build_completion_wait(&cmd2, iommu, data); 3184 3185 ret = __iommu_queue_command_sync(iommu, &cmd, true); 3186 if (ret) 3187 goto out_err; 3188 ret = __iommu_queue_command_sync(iommu, &cmd2, false); 3189 if (ret) 3190 goto out_err; 3191 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3192 3193 wait_on_sem(iommu, data); 3194 return; 3195 3196 out_err: 3197 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3198 } 3199 3200 static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) 3201 { 3202 if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) 3203 return DTE_INTTABLEN_2K; 3204 return DTE_INTTABLEN_512; 3205 } 3206 3207 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, 3208 struct irq_remap_table *table) 3209 { 3210 u64 new; 3211 struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; 3212 struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3213 3214 if (dev_data) 3215 spin_lock(&dev_data->dte_lock); 3216 3217 new = READ_ONCE(dte->data[2]); 3218 new &= ~DTE_IRQ_PHYS_ADDR_MASK; 3219 new |= iommu_virt_to_phys(table->table); 3220 new |= DTE_IRQ_REMAP_INTCTL; 3221 new |= iommu_get_int_tablen(dev_data); 3222 new |= DTE_IRQ_REMAP_ENABLE; 3223 WRITE_ONCE(dte->data[2], new); 3224 3225 if (dev_data) 3226 spin_unlock(&dev_data->dte_lock); 3227 } 3228 3229 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) 3230 { 3231 struct irq_remap_table *table; 3232 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3233 3234 if (WARN_ONCE(!pci_seg->rlookup_table[devid], 3235 "%s: no iommu for devid %x:%x\n", 3236 __func__, pci_seg->id, devid)) 3237 return NULL; 3238 
3239 table = pci_seg->irq_lookup_table[devid]; 3240 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n", 3241 __func__, pci_seg->id, devid)) 3242 return NULL; 3243 3244 return table; 3245 } 3246 3247 static struct irq_remap_table *__alloc_irq_table(int nid, size_t size) 3248 { 3249 struct irq_remap_table *table; 3250 3251 table = kzalloc(sizeof(*table), GFP_KERNEL); 3252 if (!table) 3253 return NULL; 3254 3255 table->table = iommu_alloc_pages_node_sz( 3256 nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size)); 3257 if (!table->table) { 3258 kfree(table); 3259 return NULL; 3260 } 3261 raw_spin_lock_init(&table->lock); 3262 3263 return table; 3264 } 3265 3266 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, 3267 struct irq_remap_table *table) 3268 { 3269 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; 3270 3271 pci_seg->irq_lookup_table[devid] = table; 3272 set_dte_irq_entry(iommu, devid, table); 3273 iommu_flush_dte(iommu, devid); 3274 } 3275 3276 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, 3277 void *data) 3278 { 3279 struct irq_remap_table *table = data; 3280 struct amd_iommu_pci_seg *pci_seg; 3281 struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev); 3282 3283 if (!iommu) 3284 return -EINVAL; 3285 3286 pci_seg = iommu->pci_seg; 3287 pci_seg->irq_lookup_table[alias] = table; 3288 set_dte_irq_entry(iommu, alias, table); 3289 iommu_flush_dte(pci_seg->rlookup_table[alias], alias); 3290 3291 return 0; 3292 } 3293 3294 static inline size_t get_irq_table_size(unsigned int max_irqs) 3295 { 3296 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3297 return max_irqs * sizeof(u32); 3298 3299 return max_irqs * (sizeof(u64) * 2); 3300 } 3301 3302 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, 3303 u16 devid, struct pci_dev *pdev, 3304 unsigned int max_irqs) 3305 { 3306 struct irq_remap_table *table = NULL; 3307 struct irq_remap_table *new_table = NULL; 3308 struct amd_iommu_pci_seg *pci_seg; 3309 unsigned long flags; 3310 int nid = iommu->dev ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3311 u16 alias; 3312 3313 spin_lock_irqsave(&iommu_table_lock, flags); 3314 3315 pci_seg = iommu->pci_seg; 3316 table = pci_seg->irq_lookup_table[devid]; 3317 if (table) 3318 goto out_unlock; 3319 3320 alias = pci_seg->alias_table[devid]; 3321 table = pci_seg->irq_lookup_table[alias]; 3322 if (table) { 3323 set_remap_table_entry(iommu, devid, table); 3324 goto out_wait; 3325 } 3326 spin_unlock_irqrestore(&iommu_table_lock, flags); 3327 3328 /* Nothing there yet, allocate new irq remapping table */ 3329 new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs)); 3330 if (!new_table) 3331 return NULL; 3332 3333 spin_lock_irqsave(&iommu_table_lock, flags); 3334 3335 table = pci_seg->irq_lookup_table[devid]; 3336 if (table) 3337 goto out_unlock; 3338 3339 table = pci_seg->irq_lookup_table[alias]; 3340 if (table) { 3341 set_remap_table_entry(iommu, devid, table); 3342 goto out_wait; 3343 } 3344 3345 table = new_table; 3346 new_table = NULL; 3347 3348 if (pdev) 3349 pci_for_each_dma_alias(pdev, set_remap_table_entry_alias, 3350 table); 3351 else 3352 set_remap_table_entry(iommu, devid, table); 3353 3354 if (devid != alias) 3355 set_remap_table_entry(iommu, alias, table); 3356 3357 out_wait: 3358 iommu_completion_wait(iommu); 3359 3360 out_unlock: 3361 spin_unlock_irqrestore(&iommu_table_lock, flags); 3362 3363 if (new_table) { 3364 iommu_free_pages(new_table->table); 3365 kfree(new_table); 3366 } 3367 return table; 3368 } 3369 3370 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, 3371 bool align, struct pci_dev *pdev, 3372 unsigned long max_irqs) 3373 { 3374 struct irq_remap_table *table; 3375 int index, c, alignment = 1; 3376 unsigned long flags; 3377 3378 table = alloc_irq_table(iommu, devid, pdev, max_irqs); 3379 if (!table) 3380 return -ENODEV; 3381 3382 if (align) 3383 alignment = roundup_pow_of_two(count); 3384 3385 raw_spin_lock_irqsave(&table->lock, flags); 3386 3387 /* Scan table for free entries */ 3388 for (index = ALIGN(table->min_index, alignment), c = 0; 3389 index < max_irqs;) { 3390 if (!iommu->irte_ops->is_allocated(table, index)) { 3391 c += 1; 3392 } else { 3393 c = 0; 3394 index = ALIGN(index + 1, alignment); 3395 continue; 3396 } 3397 3398 if (c == count) { 3399 for (; c != 0; --c) 3400 iommu->irte_ops->set_allocated(table, index - c + 1); 3401 3402 index -= count - 1; 3403 goto out; 3404 } 3405 3406 index++; 3407 } 3408 3409 index = -ENOSPC; 3410 3411 out: 3412 raw_spin_unlock_irqrestore(&table->lock, flags); 3413 3414 return index; 3415 } 3416 3417 static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3418 struct irte_ga *irte) 3419 { 3420 struct irq_remap_table *table; 3421 struct irte_ga *entry; 3422 unsigned long flags; 3423 u128 old; 3424 3425 table = get_irq_table(iommu, devid); 3426 if (!table) 3427 return -ENOMEM; 3428 3429 raw_spin_lock_irqsave(&table->lock, flags); 3430 3431 entry = (struct irte_ga *)table->table; 3432 entry = &entry[index]; 3433 3434 /* 3435 * We use cmpxchg16 to atomically update the 128-bit IRTE, 3436 * and it cannot be updated by the hardware or other processors 3437 * behind us, so the return value of cmpxchg16 should be the 3438 * same as the old value. 
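 * (Hence the WARN_ON() around try_cmpxchg128() below: a mismatch would
 * indicate an unexpected concurrent writer.)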
3439 */ 3440 old = entry->irte; 3441 WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte)); 3442 3443 raw_spin_unlock_irqrestore(&table->lock, flags); 3444 3445 return 0; 3446 } 3447 3448 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3449 struct irte_ga *irte) 3450 { 3451 int ret; 3452 3453 ret = __modify_irte_ga(iommu, devid, index, irte); 3454 if (ret) 3455 return ret; 3456 3457 iommu_flush_irt_and_complete(iommu, devid); 3458 3459 return 0; 3460 } 3461 3462 static int modify_irte(struct amd_iommu *iommu, 3463 u16 devid, int index, union irte *irte) 3464 { 3465 struct irq_remap_table *table; 3466 unsigned long flags; 3467 3468 table = get_irq_table(iommu, devid); 3469 if (!table) 3470 return -ENOMEM; 3471 3472 raw_spin_lock_irqsave(&table->lock, flags); 3473 table->table[index] = irte->val; 3474 raw_spin_unlock_irqrestore(&table->lock, flags); 3475 3476 iommu_flush_irt_and_complete(iommu, devid); 3477 3478 return 0; 3479 } 3480 3481 static void free_irte(struct amd_iommu *iommu, u16 devid, int index) 3482 { 3483 struct irq_remap_table *table; 3484 unsigned long flags; 3485 3486 table = get_irq_table(iommu, devid); 3487 if (!table) 3488 return; 3489 3490 raw_spin_lock_irqsave(&table->lock, flags); 3491 iommu->irte_ops->clear_allocated(table, index); 3492 raw_spin_unlock_irqrestore(&table->lock, flags); 3493 3494 iommu_flush_irt_and_complete(iommu, devid); 3495 } 3496 3497 static void irte_prepare(void *entry, 3498 u32 delivery_mode, bool dest_mode, 3499 u8 vector, u32 dest_apicid, int devid) 3500 { 3501 union irte *irte = (union irte *) entry; 3502 3503 irte->val = 0; 3504 irte->fields.vector = vector; 3505 irte->fields.int_type = delivery_mode; 3506 irte->fields.destination = dest_apicid; 3507 irte->fields.dm = dest_mode; 3508 irte->fields.valid = 1; 3509 } 3510 3511 static void irte_ga_prepare(void *entry, 3512 u32 delivery_mode, bool dest_mode, 3513 u8 vector, u32 dest_apicid, int devid) 3514 { 3515 struct irte_ga *irte = (struct irte_ga *) entry; 3516 3517 irte->lo.val = 0; 3518 irte->hi.val = 0; 3519 irte->lo.fields_remap.int_type = delivery_mode; 3520 irte->lo.fields_remap.dm = dest_mode; 3521 irte->hi.fields.vector = vector; 3522 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid); 3523 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid); 3524 irte->lo.fields_remap.valid = 1; 3525 } 3526 3527 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3528 { 3529 union irte *irte = (union irte *) entry; 3530 3531 irte->fields.valid = 1; 3532 modify_irte(iommu, devid, index, irte); 3533 } 3534 3535 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3536 { 3537 struct irte_ga *irte = (struct irte_ga *) entry; 3538 3539 irte->lo.fields_remap.valid = 1; 3540 modify_irte_ga(iommu, devid, index, irte); 3541 } 3542 3543 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3544 { 3545 union irte *irte = (union irte *) entry; 3546 3547 irte->fields.valid = 0; 3548 modify_irte(iommu, devid, index, irte); 3549 } 3550 3551 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index) 3552 { 3553 struct irte_ga *irte = (struct irte_ga *) entry; 3554 3555 irte->lo.fields_remap.valid = 0; 3556 modify_irte_ga(iommu, devid, index, irte); 3557 } 3558 3559 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3560 u8 vector, u32 dest_apicid) 3561 { 3562 union irte *irte = 
(union irte *) entry; 3563 3564 irte->fields.vector = vector; 3565 irte->fields.destination = dest_apicid; 3566 modify_irte(iommu, devid, index, irte); 3567 } 3568 3569 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index, 3570 u8 vector, u32 dest_apicid) 3571 { 3572 struct irte_ga *irte = (struct irte_ga *) entry; 3573 3574 if (!irte->lo.fields_remap.guest_mode) { 3575 irte->hi.fields.vector = vector; 3576 irte->lo.fields_remap.destination = 3577 APICID_TO_IRTE_DEST_LO(dest_apicid); 3578 irte->hi.fields.destination = 3579 APICID_TO_IRTE_DEST_HI(dest_apicid); 3580 modify_irte_ga(iommu, devid, index, irte); 3581 } 3582 } 3583 3584 #define IRTE_ALLOCATED (~1U) 3585 static void irte_set_allocated(struct irq_remap_table *table, int index) 3586 { 3587 table->table[index] = IRTE_ALLOCATED; 3588 } 3589 3590 static void irte_ga_set_allocated(struct irq_remap_table *table, int index) 3591 { 3592 struct irte_ga *ptr = (struct irte_ga *)table->table; 3593 struct irte_ga *irte = &ptr[index]; 3594 3595 memset(&irte->lo.val, 0, sizeof(u64)); 3596 memset(&irte->hi.val, 0, sizeof(u64)); 3597 irte->hi.fields.vector = 0xff; 3598 } 3599 3600 static bool irte_is_allocated(struct irq_remap_table *table, int index) 3601 { 3602 union irte *ptr = (union irte *)table->table; 3603 union irte *irte = &ptr[index]; 3604 3605 return irte->val != 0; 3606 } 3607 3608 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) 3609 { 3610 struct irte_ga *ptr = (struct irte_ga *)table->table; 3611 struct irte_ga *irte = &ptr[index]; 3612 3613 return irte->hi.fields.vector != 0; 3614 } 3615 3616 static void irte_clear_allocated(struct irq_remap_table *table, int index) 3617 { 3618 table->table[index] = 0; 3619 } 3620 3621 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) 3622 { 3623 struct irte_ga *ptr = (struct irte_ga *)table->table; 3624 struct irte_ga *irte = &ptr[index]; 3625 3626 memset(&irte->lo.val, 0, sizeof(u64)); 3627 memset(&irte->hi.val, 0, sizeof(u64)); 3628 } 3629 3630 static int get_devid(struct irq_alloc_info *info) 3631 { 3632 switch (info->type) { 3633 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3634 return get_ioapic_devid(info->devid); 3635 case X86_IRQ_ALLOC_TYPE_HPET: 3636 return get_hpet_devid(info->devid); 3637 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3638 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3639 return get_device_sbdf_id(msi_desc_to_dev(info->desc)); 3640 default: 3641 WARN_ON_ONCE(1); 3642 return -1; 3643 } 3644 } 3645 3646 struct irq_remap_ops amd_iommu_irq_ops = { 3647 .prepare = amd_iommu_prepare, 3648 .enable = amd_iommu_enable, 3649 .disable = amd_iommu_disable, 3650 .reenable = amd_iommu_reenable, 3651 .enable_faulting = amd_iommu_enable_faulting, 3652 }; 3653 3654 static void fill_msi_msg(struct msi_msg *msg, u32 index) 3655 { 3656 msg->data = index; 3657 msg->address_lo = 0; 3658 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; 3659 /* 3660 * The struct msi_msg.dest_mode_logical is used to set the DM bit 3661 * in the MSI Message Address Register. For devices with 2K int-remap support, 3662 * this bit must be set to 1 regardless of the actual destination 3663 * mode, which is signified by the IRTE[DM].
3664 */ 3665 if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) 3666 msg->arch_addr_lo.dest_mode_logical = true; 3667 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; 3668 } 3669 3670 static void irq_remapping_prepare_irte(struct amd_ir_data *data, 3671 struct irq_cfg *irq_cfg, 3672 struct irq_alloc_info *info, 3673 int devid, int index, int sub_handle) 3674 { 3675 struct irq_2_irte *irte_info = &data->irq_2_irte; 3676 struct amd_iommu *iommu = data->iommu; 3677 3678 if (!iommu) 3679 return; 3680 3681 data->irq_2_irte.devid = devid; 3682 data->irq_2_irte.index = index + sub_handle; 3683 iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED, 3684 apic->dest_mode_logical, irq_cfg->vector, 3685 irq_cfg->dest_apicid, devid); 3686 3687 switch (info->type) { 3688 case X86_IRQ_ALLOC_TYPE_IOAPIC: 3689 case X86_IRQ_ALLOC_TYPE_HPET: 3690 case X86_IRQ_ALLOC_TYPE_PCI_MSI: 3691 case X86_IRQ_ALLOC_TYPE_PCI_MSIX: 3692 fill_msi_msg(&data->msi_entry, irte_info->index); 3693 break; 3694 3695 default: 3696 BUG_ON(1); 3697 break; 3698 } 3699 } 3700 3701 struct amd_irte_ops irte_32_ops = { 3702 .prepare = irte_prepare, 3703 .activate = irte_activate, 3704 .deactivate = irte_deactivate, 3705 .set_affinity = irte_set_affinity, 3706 .set_allocated = irte_set_allocated, 3707 .is_allocated = irte_is_allocated, 3708 .clear_allocated = irte_clear_allocated, 3709 }; 3710 3711 struct amd_irte_ops irte_128_ops = { 3712 .prepare = irte_ga_prepare, 3713 .activate = irte_ga_activate, 3714 .deactivate = irte_ga_deactivate, 3715 .set_affinity = irte_ga_set_affinity, 3716 .set_allocated = irte_ga_set_allocated, 3717 .is_allocated = irte_ga_is_allocated, 3718 .clear_allocated = irte_ga_clear_allocated, 3719 }; 3720 3721 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, 3722 unsigned int nr_irqs, void *arg) 3723 { 3724 struct irq_alloc_info *info = arg; 3725 struct irq_data *irq_data; 3726 struct amd_ir_data *data = NULL; 3727 struct amd_iommu *iommu; 3728 struct irq_cfg *cfg; 3729 struct iommu_dev_data *dev_data; 3730 unsigned long max_irqs; 3731 int i, ret, devid, seg, sbdf; 3732 int index; 3733 3734 if (!info) 3735 return -EINVAL; 3736 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) 3737 return -EINVAL; 3738 3739 sbdf = get_devid(info); 3740 if (sbdf < 0) 3741 return -EINVAL; 3742 3743 seg = PCI_SBDF_TO_SEGID(sbdf); 3744 devid = PCI_SBDF_TO_DEVID(sbdf); 3745 iommu = __rlookup_amd_iommu(seg, devid); 3746 if (!iommu) 3747 return -EINVAL; 3748 3749 dev_data = search_dev_data(iommu, devid); 3750 max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; 3751 3752 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); 3753 if (ret < 0) 3754 return ret; 3755 3756 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { 3757 struct irq_remap_table *table; 3758 3759 table = alloc_irq_table(iommu, devid, NULL, max_irqs); 3760 if (table) { 3761 if (!table->min_index) { 3762 /* 3763 * Keep the first 32 indexes free for IOAPIC 3764 * interrupts. 
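 * (For IOAPIC allocations the pin number is then used directly as the
 * IRTE index, see info->ioapic.pin below.)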
3765 */ 3766 table->min_index = 32; 3767 for (i = 0; i < 32; ++i) 3768 iommu->irte_ops->set_allocated(table, i); 3769 } 3770 WARN_ON(table->min_index != 32); 3771 index = info->ioapic.pin; 3772 } else { 3773 index = -ENOMEM; 3774 } 3775 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI || 3776 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) { 3777 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); 3778 3779 index = alloc_irq_index(iommu, devid, nr_irqs, align, 3780 msi_desc_to_pci_dev(info->desc), 3781 max_irqs); 3782 } else { 3783 index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, 3784 max_irqs); 3785 } 3786 3787 if (index < 0) { 3788 pr_warn("Failed to allocate IRTE\n"); 3789 ret = index; 3790 goto out_free_parent; 3791 } 3792 3793 for (i = 0; i < nr_irqs; i++) { 3794 irq_data = irq_domain_get_irq_data(domain, virq + i); 3795 cfg = irq_data ? irqd_cfg(irq_data) : NULL; 3796 if (!cfg) { 3797 ret = -EINVAL; 3798 goto out_free_data; 3799 } 3800 3801 ret = -ENOMEM; 3802 data = kzalloc(sizeof(*data), GFP_KERNEL); 3803 if (!data) 3804 goto out_free_data; 3805 3806 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) 3807 data->entry = kzalloc(sizeof(union irte), GFP_KERNEL); 3808 else 3809 data->entry = kzalloc(sizeof(struct irte_ga), 3810 GFP_KERNEL); 3811 if (!data->entry) { 3812 kfree(data); 3813 goto out_free_data; 3814 } 3815 3816 data->iommu = iommu; 3817 irq_data->hwirq = (devid << 16) + i; 3818 irq_data->chip_data = data; 3819 irq_data->chip = &amd_ir_chip; 3820 irq_remapping_prepare_irte(data, cfg, info, devid, index, i); 3821 } 3822 3823 return 0; 3824 3825 out_free_data: 3826 for (i--; i >= 0; i--) { 3827 irq_data = irq_domain_get_irq_data(domain, virq + i); 3828 if (irq_data) 3829 kfree(irq_data->chip_data); 3830 } 3831 for (i = 0; i < nr_irqs; i++) 3832 free_irte(iommu, devid, index + i); 3833 out_free_parent: 3834 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3835 return ret; 3836 } 3837 3838 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, 3839 unsigned int nr_irqs) 3840 { 3841 struct irq_2_irte *irte_info; 3842 struct irq_data *irq_data; 3843 struct amd_ir_data *data; 3844 int i; 3845 3846 for (i = 0; i < nr_irqs; i++) { 3847 irq_data = irq_domain_get_irq_data(domain, virq + i); 3848 if (irq_data && irq_data->chip_data) { 3849 data = irq_data->chip_data; 3850 irte_info = &data->irq_2_irte; 3851 free_irte(data->iommu, irte_info->devid, irte_info->index); 3852 kfree(data->entry); 3853 kfree(data); 3854 } 3855 } 3856 irq_domain_free_irqs_common(domain, virq, nr_irqs); 3857 } 3858 3859 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 3860 struct amd_ir_data *ir_data, 3861 struct irq_2_irte *irte_info, 3862 struct irq_cfg *cfg); 3863 3864 static int irq_remapping_activate(struct irq_domain *domain, 3865 struct irq_data *irq_data, bool reserve) 3866 { 3867 struct amd_ir_data *data = irq_data->chip_data; 3868 struct irq_2_irte *irte_info = &data->irq_2_irte; 3869 struct amd_iommu *iommu = data->iommu; 3870 struct irq_cfg *cfg = irqd_cfg(irq_data); 3871 3872 if (!iommu) 3873 return 0; 3874 3875 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid, 3876 irte_info->index); 3877 amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg); 3878 return 0; 3879 } 3880 3881 static void irq_remapping_deactivate(struct irq_domain *domain, 3882 struct irq_data *irq_data) 3883 { 3884 struct amd_ir_data *data = irq_data->chip_data; 3885 struct irq_2_irte *irte_info = &data->irq_2_irte; 3886 struct amd_iommu *iommu = 
data->iommu; 3887 3888 if (iommu) 3889 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid, 3890 irte_info->index); 3891 } 3892 3893 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, 3894 enum irq_domain_bus_token bus_token) 3895 { 3896 struct amd_iommu *iommu; 3897 int devid = -1; 3898 3899 if (!amd_iommu_irq_remap) 3900 return 0; 3901 3902 if (x86_fwspec_is_ioapic(fwspec)) 3903 devid = get_ioapic_devid(fwspec->param[0]); 3904 else if (x86_fwspec_is_hpet(fwspec)) 3905 devid = get_hpet_devid(fwspec->param[0]); 3906 3907 if (devid < 0) 3908 return 0; 3909 iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff)); 3910 3911 return iommu && iommu->ir_domain == d; 3912 } 3913 3914 static const struct irq_domain_ops amd_ir_domain_ops = { 3915 .select = irq_remapping_select, 3916 .alloc = irq_remapping_alloc, 3917 .free = irq_remapping_free, 3918 .activate = irq_remapping_activate, 3919 .deactivate = irq_remapping_deactivate, 3920 }; 3921 3922 static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, 3923 bool ga_log_intr) 3924 { 3925 if (cpu >= 0) { 3926 entry->lo.fields_vapic.destination = 3927 APICID_TO_IRTE_DEST_LO(cpu); 3928 entry->hi.fields.destination = 3929 APICID_TO_IRTE_DEST_HI(cpu); 3930 entry->lo.fields_vapic.is_run = true; 3931 entry->lo.fields_vapic.ga_log_intr = false; 3932 } else { 3933 entry->lo.fields_vapic.is_run = false; 3934 entry->lo.fields_vapic.ga_log_intr = ga_log_intr; 3935 } 3936 } 3937 3938 /* 3939 * Update the pCPU information for an IRTE that is configured to post IRQs to 3940 * a vCPU, without issuing an IOMMU invalidation for the IRTE. 3941 * 3942 * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination 3943 * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't 3944 * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based 3945 * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is 3946 * blocking and requires a notification wake event). I.e. treat vCPUs that are 3947 * associated with a pCPU as running. This API is intended to be used when a 3948 * vCPU is scheduled in/out (or stops running for any reason), to do a fast 3949 * update of IsRun, GALogIntr, and (conditionally) Destination. 3950 * 3951 * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached 3952 * and thus don't require an invalidation to ensure the IOMMU consumes fresh 3953 * information. 
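 *
 * A minimal usage sketch (hypothetical caller, e.g. a KVM vCPU scheduling
 * path, following the description above; not a prescribed call sequence):
 *
 *   amd_iommu_update_ga(ir_data, cpu, false);  - vCPU starts running on @cpu
 *   amd_iommu_update_ga(ir_data, -1, true);    - vCPU blocks and wants a GA
 *                                                log wake event
 *   amd_iommu_update_ga(ir_data, -1, false);   - vCPU stops running, no wake
 *                                                event needed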
3954 */ 3955 int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 3956 { 3957 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3958 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3959 3960 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3961 return -EINVAL; 3962 3963 if (!entry || !entry->lo.fields_vapic.guest_mode) 3964 return 0; 3965 3966 if (!ir_data->iommu) 3967 return -ENODEV; 3968 3969 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3970 3971 return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3972 ir_data->irq_2_irte.index, entry); 3973 } 3974 EXPORT_SYMBOL(amd_iommu_update_ga); 3975 3976 int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 3977 { 3978 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3979 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3980 u64 valid; 3981 3982 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3983 return -EINVAL; 3984 3985 if (!entry) 3986 return 0; 3987 3988 valid = entry->lo.fields_vapic.valid; 3989 3990 entry->lo.val = 0; 3991 entry->hi.val = 0; 3992 3993 entry->lo.fields_vapic.valid = valid; 3994 entry->lo.fields_vapic.guest_mode = 1; 3995 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; 3996 entry->hi.fields.vector = ir_data->ga_vector; 3997 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; 3998 3999 __amd_iommu_update_ga(entry, cpu, ga_log_intr); 4000 4001 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4002 ir_data->irq_2_irte.index, entry); 4003 } 4004 EXPORT_SYMBOL(amd_iommu_activate_guest_mode); 4005 4006 int amd_iommu_deactivate_guest_mode(void *data) 4007 { 4008 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4009 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4010 struct irq_cfg *cfg = ir_data->cfg; 4011 u64 valid; 4012 4013 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4014 return -EINVAL; 4015 4016 if (!entry || !entry->lo.fields_vapic.guest_mode) 4017 return 0; 4018 4019 valid = entry->lo.fields_remap.valid; 4020 4021 entry->lo.val = 0; 4022 entry->hi.val = 0; 4023 4024 entry->lo.fields_remap.valid = valid; 4025 entry->lo.fields_remap.dm = apic->dest_mode_logical; 4026 entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED; 4027 entry->hi.fields.vector = cfg->vector; 4028 entry->lo.fields_remap.destination = 4029 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); 4030 entry->hi.fields.destination = 4031 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid); 4032 4033 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4034 ir_data->irq_2_irte.index, entry); 4035 } 4036 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); 4037 4038 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) 4039 { 4040 int ret; 4041 struct amd_iommu_pi_data *pi_data = info; 4042 struct amd_ir_data *ir_data = data->chip_data; 4043 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4044 struct iommu_dev_data *dev_data; 4045 4046 if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 4047 return -EINVAL; 4048 4049 if (ir_data->iommu == NULL) 4050 return -EINVAL; 4051 4052 dev_data = search_dev_data(ir_data->iommu, irte_info->devid); 4053 4054 /* Note: 4055 * This device has never been set up for guest mode. 
4056 * We should not modify the IRTE. 4057 */ 4058 if (!dev_data || !dev_data->use_vapic) 4059 return -EINVAL; 4060 4061 ir_data->cfg = irqd_cfg(data); 4062 4063 if (pi_data) { 4064 pi_data->ir_data = ir_data; 4065 4066 ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); 4067 ir_data->ga_vector = pi_data->vector; 4068 ir_data->ga_tag = pi_data->ga_tag; 4069 if (pi_data->is_guest_mode) 4070 ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, 4071 pi_data->ga_log_intr); 4072 else 4073 ret = amd_iommu_deactivate_guest_mode(ir_data); 4074 } else { 4075 ret = amd_iommu_deactivate_guest_mode(ir_data); 4076 } 4077 4078 return ret; 4079 } 4080 4081 4082 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, 4083 struct amd_ir_data *ir_data, 4084 struct irq_2_irte *irte_info, 4085 struct irq_cfg *cfg) 4086 { 4087 4088 /* 4089 * Atomically update the IRTE with the new destination and vector, 4090 * and flush the interrupt entry cache. 4091 */ 4092 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid, 4093 irte_info->index, cfg->vector, 4094 cfg->dest_apicid); 4095 } 4096 4097 static int amd_ir_set_affinity(struct irq_data *data, 4098 const struct cpumask *mask, bool force) 4099 { 4100 struct amd_ir_data *ir_data = data->chip_data; 4101 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 4102 struct irq_cfg *cfg = irqd_cfg(data); 4103 struct irq_data *parent = data->parent_data; 4104 struct amd_iommu *iommu = ir_data->iommu; 4105 int ret; 4106 4107 if (!iommu) 4108 return -ENODEV; 4109 4110 ret = parent->chip->irq_set_affinity(parent, mask, force); 4111 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 4112 return ret; 4113 4114 amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg); 4115 /* 4116 * After this point, all the interrupts will start arriving 4117 * at the new destination. So, time to clean up the previous 4118 * vector allocation. 4119 */ 4120 vector_schedule_cleanup(cfg); 4121 4122 return IRQ_SET_MASK_OK_DONE; 4123 } 4124 4125 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) 4126 { 4127 struct amd_ir_data *ir_data = irq_data->chip_data; 4128 4129 *msg = ir_data->msi_entry; 4130 } 4131 4132 static struct irq_chip amd_ir_chip = { 4133 .name = "AMD-IR", 4134 .irq_ack = apic_ack_irq, 4135 .irq_set_affinity = amd_ir_set_affinity, 4136 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, 4137 .irq_compose_msi_msg = ir_compose_msi_msg, 4138 }; 4139 4140 static const struct msi_parent_ops amdvi_msi_parent_ops = { 4141 .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, 4142 .bus_select_token = DOMAIN_BUS_AMDVI, 4143 .bus_select_mask = MATCH_PCI_MSI, 4144 .prefix = "IR-", 4145 .init_dev_msi_info = msi_parent_init_dev_msi_info, 4146 }; 4147 4148 int amd_iommu_create_irq_domain(struct amd_iommu *iommu) 4149 { 4150 struct irq_domain_info info = { 4151 .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index), 4152 .ops = &amd_ir_domain_ops, 4153 .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, 4154 .host_data = iommu, 4155 .parent = arch_get_ir_parent_domain(), 4156 }; 4157 4158 if (!info.fwnode) 4159 return -ENOMEM; 4160 4161 iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops); 4162 if (!iommu->ir_domain) { 4163 irq_domain_free_fwnode(info.fwnode); 4164 return -ENOMEM; 4165 } 4166 return 0; 4167 } 4168 #endif 4169 4170 MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); 4171