// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>

#include "pseries.h"

enum {
	DDW_QUERY_PE_DMA_WIN = 0,
	DDW_CREATE_PE_DMA_WIN = 1,
	DDW_REMOVE_PE_DMA_WIN = 2,

	DDW_APPLICABLE_SIZE
};

enum {
	DDW_EXT_SIZE = 0,
	DDW_EXT_RESET_DMA_WIN = 1,
	DDW_EXT_QUERY_OUT_SIZE = 2,
	DDW_EXT_LIMITED_ADDR_MODE = 3
};

static struct iommu_table *iommu_pseries_alloc_table(int node)
{
	struct iommu_table *tbl;

	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
	if (!tbl)
		return NULL;

	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
	return tbl;
}

#ifdef CONFIG_IOMMU_API
static struct iommu_table_group_ops spapr_tce_table_group_ops;
#endif

static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
	struct iommu_table_group *table_group;

	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
	if (!table_group)
		return NULL;

#ifdef CONFIG_IOMMU_API
	table_group->ops = &spapr_tce_table_group_ops;
	table_group->pgsizes = SZ_4K;
#endif

	table_group->tables[0] = iommu_pseries_alloc_table(node);
	if (table_group->tables[0])
		return table_group;

	kfree(table_group);
	return NULL;
}

static void iommu_pseries_free_group(struct iommu_table_group *table_group,
		const char *node_name)
{
	if (!table_group)
		return;

#ifdef CONFIG_IOMMU_API
	if (table_group->group) {
		iommu_group_put(table_group->group);
		BUG_ON(table_group->group);
	}
#endif

	/* Default DMA window table is at index 0, while DDW at 1. SR-IOV
	 * adapters only have table on index 0 (if not direct mapped).
	 */
	if (table_group->tables[0])
		iommu_tce_table_put(table_group->tables[0]);

	if (table_group->tables[1])
		iommu_tce_table_put(table_group->tables[1]);

	kfree(table_group);
}

static int tce_build_pSeries(struct iommu_table *tbl, long index,
			      long npages, unsigned long uaddr,
			      enum dma_data_direction direction,
			      unsigned long attrs)
{
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	const unsigned long tceshift = tbl->it_page_shift;
	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);

	proto_tce = TCE_PCI_READ; // Read allowed

	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--) {
		/* can't move this out since we might cross MEMBLOCK boundary */
		rpn = __pa(uaddr) >> tceshift;
		*tcep = cpu_to_be64(proto_tce | rpn << tceshift);

		uaddr += pagesize;
		tcep++;
	}
	return 0;
}


static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--)
		*(tcep++) = 0;
}

static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	return be64_to_cpu(*tcep);
}

#ifdef CONFIG_IOMMU_API
static long pseries_tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
	unsigned long cb = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);
	unsigned long *uas;

	if (tbl->it_indirect_levels) /* Impossible */
		return -EPERM;

	WARN_ON(tbl->it_userspace);

	uas = vzalloc(cb);
	if (!uas)
		return -ENOMEM;

	tbl->it_userspace = (__be64 *) uas;

	return 0;
}
#endif

static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
{
	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
}

static void tce_free_pSeries(struct iommu_table *tbl)
{
	if (!tbl->it_userspace)
		tce_iommu_userspace_view_free(tbl);
}

static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
				long npages, unsigned long uaddr,
				enum dma_data_direction direction,
				unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce, tce;
	u64 rpn;
	int ret = 0;
	long tcenum_start = tcenum, npages_start = npages;

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	while (npages--) {
		tce = proto_tce | rpn << tceshift;
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);

		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
			ret = (int)rc;
			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
					   (npages_start - (npages + 1)));
			break;
		}

		if (rc && printk_ratelimit()) {
			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex = 0x%llx\n", (u64)liobn);
			printk("\ttcenum = 0x%llx\n", (u64)tcenum);
			printk("\ttce val = 0x%llx\n", tce);
			dump_stack();
		}

		tcenum++;
		rpn++;
	}
	return ret;
}

static DEFINE_PER_CPU(__be64 *, tce_page);

static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				     long npages, unsigned long uaddr,
				     enum dma_data_direction direction,
				     unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	long l, limit;
	long tcenum_start = tcenum, npages_start = npages;
	int ret = 0;
	unsigned long flags;
	const unsigned long tceshift = tbl->it_page_shift;

	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		return tce_build_pSeriesLP(tbl->it_index, tcenum,
					   tceshift, npages, uaddr,
					   direction, attrs);
	}

	local_irq_save(flags);	/* to protect tcep and the page behind it */

	tcep = __this_cpu_read(tce_page);

	/* This is safe to do since interrupts are off when we're called
	 * from iommu_alloc{,_sg}()
	 */
	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		/* If allocation fails, fall back to the loop implementation */
		if (!tcep) {
			local_irq_restore(flags);
			return tce_build_pSeriesLP(tbl->it_index, tcenum,
					tceshift,
					npages, uaddr, direction, attrs);
		}
		__this_cpu_write(tce_page, tcep);
	}

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
			rpn++;
		}

		rc = plpar_tce_put_indirect((u64)tbl->it_index,
					    (u64)tcenum << tceshift,
					    (u64)__pa(tcep),
					    limit);

		npages -= limit;
		tcenum += limit;
	} while (npages > 0 && !rc);

	local_irq_restore(flags);

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		ret = (int)rc;
		tce_freemulti_pSeriesLP(tbl, tcenum_start,
					(npages_start - (npages + limit)));
		return ret;
	}

	if (rc && printk_ratelimit()) {
		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
		printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages = 0x%llx\n", (u64)npages);
		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
		dump_stack();
	}
	return ret;
}

static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages)
{
	u64 rc;

	while (npages--) {
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);

		if (rc && printk_ratelimit()) {
			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex = 0x%llx\n", (u64)liobn);
			printk("\ttcenum = 0x%llx\n", (u64)tcenum);
			dump_stack();
		}

		tcenum++;
	}
}


static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;
	long rpages = npages;
	unsigned long limit;

	if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
		return tce_free_pSeriesLP(tbl->it_index, tcenum,
					  tbl->it_page_shift, npages);

	do {
		limit = min_t(unsigned long, rpages, 512);

		rc = plpar_tce_stuff((u64)tbl->it_index,
				     (u64)tcenum << tbl->it_page_shift, 0, limit);

		rpages -= limit;
		tcenum += limit;
	} while (rpages > 0 && !rc);

	if (rc && printk_ratelimit()) {
		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
		printk("\trc = %lld\n", rc);
		printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages = 0x%llx\n", (u64)npages);
		dump_stack();
	}
}

static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
	u64 rc;
	unsigned long tce_ret;

	rc = plpar_tce_get((u64)tbl->it_index,
			   (u64)tcenum << tbl->it_page_shift, &tce_ret);

	if (rc && printk_ratelimit()) {
		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
		printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
		printk("\ttcenum = 0x%llx\n", (u64)tcenum);
		dump_stack();
	}

	return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
	__be32	liobn;		/* tce table number */
	__be64	dma_base;	/* address hi,lo */
	__be32	tce_shift;	/* ilog2(tce_page_size) */
	__be32	window_shift;	/* ilog2(tce_window_size) */
};

struct dma_win {
	struct device_node *device;
	const struct dynamic_dma_window_prop *prop;
	bool direct;
	struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};

struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};

static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);

static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
		unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	int rc;
	u64 tce_size, num_tce, dma_offset, next;
	u32 tce_shift;
	long limit;

	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 512);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
				     dma_offset,
				     0, limit);
		next += limit * tce_size;
		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	return rc;
}

static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
		unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	__be64 *tcep;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
		unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
				be64_to_cpu(maprange->dma_base);
		unsigned long tcenum = dmastart >> tceshift;
		unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
		void *uaddr = __va(start_pfn << PAGE_SHIFT);

		return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
				tcenum, tceshift, npages, (unsigned long) uaddr,
				DMA_BIDIRECTIONAL, 0);
	}

	local_irq_disable();	/* to protect tcep and the page behind it */
	tcep = __this_cpu_read(tce_page);

	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__this_cpu_write(tce_page, tcep);
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | next);
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)__pa(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}

static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
					unsigned long liobn, unsigned long win_addr,
					unsigned long window_size, unsigned long page_shift,
					void *base, struct iommu_table_ops *table_ops)
{
	tbl->it_busno = busno;
	tbl->it_index = liobn;
	tbl->it_offset = win_addr >> page_shift;
	tbl->it_size = window_size >> page_shift;
	tbl->it_page_shift = page_shift;
	tbl->it_base = (unsigned long)base;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
	tbl->it_ops = table_ops;
}

struct iommu_table_ops iommu_table_pseries_ops;

static void iommu_table_setparms(struct pci_controller *phb,
				 struct device_node *dn,
				 struct iommu_table *tbl)
{
	struct device_node *node;
	const unsigned long *basep;
	const u32 *sizep;

	/* Test if we are going over 2GB of DMA space */
	if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
	}

	node = phb->dn;
	basep = of_get_property(node, "linux,tce-base", NULL);
	sizep = of_get_property(node, "linux,tce-size", NULL);
	if (basep == NULL || sizep == NULL) {
		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
				"missing tce entries !\n", dn);
		return;
	}

	iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
				    phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
				    __va(*basep), &iommu_table_pseries_ops);

	if (!is_kdump_kernel())
		memset((void *)tbl->it_base, 0, *sizep);

	phb->dma_window_base_cur += phb->dma_window_size;
}

struct iommu_table_ops iommu_table_lpar_multi_ops;

struct iommu_table_ops iommu_table_pseries_ops = {
	.set = tce_build_pSeries,
	.clear = tce_clear_pSeries,
	.get = tce_get_pseries
};

static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB.
	 */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul;	/* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
	tbl = pci->table_group->tables[0];

	iommu_table_setparms(pci->phb, dn, tbl);

	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
		panic("Failed to initialize iommu table");

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}

#ifdef CONFIG_IOMMU_API
static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
				long *tce, enum dma_data_direction *direction)
{
	long rc;
	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
	unsigned long flags, oldtce = 0;
	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
	unsigned long newtce = *tce | proto_tce;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);

	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
	if (!rc)
		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);

	if (!rc) {
		*direction = iommu_tce_direction(oldtce);
		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
	}

	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return rc;
}

static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index,
				      bool __always_unused alloc)
{
	return tbl->it_userspace ? &tbl->it_userspace[index - tbl->it_offset] : NULL;
}
#endif

struct iommu_table_ops iommu_table_lpar_multi_ops = {
	.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
	.xchg_no_kill = tce_exchange_pseries,
	.useraddrptr = tce_useraddr_pSeriesLP,
#endif
	.clear = tce_freemulti_pSeriesLP,
	.get = tce_get_pSeriesLP,
	.free = tce_free_pSeries
};

#ifdef CONFIG_IOMMU_API
/*
 * When the DMA window properties might have been removed,
 * the parent node has the table_group setup on it.
 */
static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev,
						    struct iommu_table_group *table_group)
{
	struct device_node *dn = pci_device_to_OF_node(dev);
	struct pci_dn *rpdn;

	for (; dn && PCI_DN(dn); dn = dn->parent) {
		rpdn = PCI_DN(dn);

		if (table_group == rpdn->table_group)
			return dn;
	}

	return NULL;
}
#endif

/*
 * Find nearest ibm,dma-window (default DMA window) or direct DMA window or
 * dynamic 64bit DMA window, walking up the device tree.
 */
static struct device_node *pci_dma_find(struct device_node *dn,
					struct dynamic_dma_window_prop *prop)
{
	const __be32 *default_prop = NULL;
	const __be32 *ddw_prop = NULL;
	struct device_node *rdn = NULL;
	bool default_win = false, ddw_win = false;

	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
		default_prop = of_get_property(dn, "ibm,dma-window", NULL);
		if (default_prop) {
			rdn = dn;
			default_win = true;
		}
		ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
		if (ddw_prop) {
			rdn = dn;
			ddw_win = true;
			break;
		}
		ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
		if (ddw_prop) {
			rdn = dn;
			ddw_win = true;
			break;
		}

		/* At least found default window, which is the case for normal boot */
		if (default_win)
			break;
	}

	/* For PCI devices there will always be a DMA window, either on the device
	 * or parent bus
	 */
	WARN_ON(!(default_win | ddw_win));

	/* caller doesn't want to get DMA window property */
	if (!prop)
		return rdn;

	/* parse DMA window property. During normal system boot, only default
	 * DMA window is passed in OF. But, for kdump, a dedicated adapter might
	 * have both default and DDW in FDT. In this scenario, DDW takes precedence
	 * over default window.
	 */
	if (ddw_win) {
		struct dynamic_dma_window_prop *p;

		p = (struct dynamic_dma_window_prop *)ddw_prop;
		prop->liobn = p->liobn;
		prop->dma_base = p->dma_base;
		prop->tce_shift = p->tce_shift;
		prop->window_shift = p->window_shift;
	} else if (default_win) {
		unsigned long offset, size, liobn;

		of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);

		prop->liobn = cpu_to_be32((u32)liobn);
		prop->dma_base = cpu_to_be64(offset);
		prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);
		prop->window_shift = cpu_to_be32(order_base_2(size));
	}

	return rdn;
}

static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
	struct iommu_table *tbl;
	struct device_node *dn, *pdn;
	struct pci_dn *ppci;
	struct dynamic_dma_window_prop prop;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn);

	pdn = pci_dma_find(dn, &prop);

	/* In PPC architecture, there will always be DMA window on bus or one of the
	 * parent bus. During reboot, there will be ibm,dma-window property to
	 * define DMA window. For kdump, there will at least be default window or DDW
	 * or both.
	 * There is an exception to the above. In case the PE goes into frozen
	 * state, firmware may not provide ibm,dma-window property at the time
	 * of LPAR boot up.
	 */

	if (!pdn) {
		pr_debug(" no ibm,dma-window property !\n");
		return;
	}

	ppci = PCI_DN(pdn);

	pr_debug(" parent is %pOF, iommu_table: 0x%p\n",
		 pdn, ppci->table_group);

	if (!ppci->table_group) {
		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
		tbl = ppci->table_group->tables[0];

		iommu_table_setparms_common(tbl, ppci->phb->bus->number,
				be32_to_cpu(prop.liobn),
				be64_to_cpu(prop.dma_base),
				1ULL << be32_to_cpu(prop.window_shift),
				be32_to_cpu(prop.tce_shift), NULL,
				&iommu_table_lpar_multi_ops);

		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
			panic("Failed to initialize iommu table");

		iommu_register_group(ppci->table_group,
				pci_domain_nr(bus), 0);
		pr_debug(" created table: %p\n", ppci->table_group);
	}
}


static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
	struct device_node *dn;
	struct iommu_table *tbl;

	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

	dn = dev->dev.of_node;

	/* If we're the direct child of a root bus, then we need to allocate
	 * an iommu table ourselves. The bus setup code should have setup
	 * the window sizes already.
	 */
	if (!dev->bus->self) {
		struct pci_controller *phb = PCI_DN(dn)->phb;

		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
		tbl = PCI_DN(dn)->table_group->tables[0];
		iommu_table_setparms(phb, dn, tbl);

		if (!iommu_init_table(tbl, phb->node, 0, 0))
			panic("Failed to initialize iommu table");

		set_iommu_table_base(&dev->dev, tbl);
		return;
	}

	/* If this device is further down the bus tree, search upwards until
	 * an already allocated iommu table is found and use that.
	 */

	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
		dn = dn->parent;

	if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev,
				PCI_DN(dn)->table_group->tables[0]);
	else
		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
		       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
	disable_ddw = 1;
	printk(KERN_INFO "ppc iommu: disabling ddw.\n");

	return 0;
}

early_param("disable_ddw", disable_ddw_setup);

static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
{
	int ret;

	ret = tce_clearrange_multi_pSeriesLP(0,
		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
	if (ret)
		pr_warn("%pOF failed to clear tces in window.\n",
			np);
	else
		pr_debug("%pOF successfully cleared tces in window.\n",
			 np);
}

/*
 * Call only if DMA window is clean.
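 * (i.e. all TCEs have already been cleared, e.g. via clean_dma_window()):
 * this helper only issues the ibm,remove-pe-dma-window RTAS call and does
 * not clear any TCEs itself.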
 */
static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
{
	int ret;

	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
	else
		pr_debug("%pOF: successfully removed DMA window: rtas returned "
			 "%d to ibm,remove-pe-dma-window(%x) %llx\n",
			 np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}

static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
			      struct property *win, bool cleanup)
{
	struct dynamic_dma_window_prop *dwp;
	u64 liobn;

	dwp = win->value;
	liobn = (u64)be32_to_cpu(dwp->liobn);

	if (cleanup)
		clean_dma_window(np, dwp);
	__remove_dma_window(np, ddw_avail, liobn);
}

static void copy_property(struct device_node *pdn, const char *from, const char *to)
{
	struct property *src, *dst;

	src = of_find_property(pdn, from, NULL);
	if (!src)
		return;

	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
	if (!dst)
		return;

	dst->name = kstrdup(to, GFP_KERNEL);
	dst->value = kmemdup(src->value, src->length, GFP_KERNEL);
	dst->length = src->length;
	if (!dst->name || !dst->value)
		return;

	if (of_add_property(pdn, dst)) {
		pr_err("Unable to add DMA window property for %pOF", pdn);
		goto free_prop;
	}

	return;

free_prop:
	kfree(dst->name);
	kfree(dst->value);
	kfree(dst);
}

static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name,
				   bool cleanup)
{
	struct property *win;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	int ret = 0;

	win = of_find_property(np, win_name, NULL);
	if (!win)
		return -EINVAL;

	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		return 0;

	if (win->length >= sizeof(struct dynamic_dma_window_prop))
		remove_dma_window(np, ddw_avail, win, cleanup);

	if (!remove_prop)
		return 0;

	/* Default window property if removed is lost as reset-pe doesn't restore it.
	 * Though FDT has a copy of it, the DLPAR hotplugged devices will not have a
	 * node on FDT until next reboot. So, back it up.
	 */
	if ((strcmp(win_name, "ibm,dma-window") == 0) &&
	    !of_find_property(np, "ibm,dma-window-saved", NULL))
		copy_property(np, win_name, "ibm,dma-window-saved");

	ret = of_remove_property(np, win);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window property: %d\n",
			np, ret);
	return 0;
}

static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift,
			      bool *direct_mapping)
{
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;
	bool found = false;

	spin_lock(&dma_win_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &dma_win_list, list) {
		if (window->device == pdn) {
			dma64 = window->prop;
			*dma_addr = be64_to_cpu(dma64->dma_base);
			*window_shift = be32_to_cpu(dma64->window_shift);
			*direct_mapping = window->direct;
			found = true;
			break;
		}
	}
	spin_unlock(&dma_win_list_lock);

	return found;
}

static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
					  const struct dynamic_dma_window_prop *dma64)
{
	struct dma_win *window;

	window = kzalloc(sizeof(*window), GFP_KERNEL);
	if (!window)
		return NULL;

	window->device = pdn;
	window->prop = dma64;
	window->direct = false;

	return window;
}

static void find_existing_ddw_windows_named(const char *name)
{
	int len;
	struct device_node *pdn;
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;

	for_each_node_with_property(pdn, name) {
		dma64 = of_get_property(pdn, name, &len);
		if (!dma64 || len < sizeof(*dma64)) {
			remove_dma_window_named(pdn, true, name, true);
			continue;
		}

		/* If at the time of system initialization, there are DDWs in OF,
		 * it means this is during kexec. DDW could be direct or dynamic.
		 * We will just mark DDWs as "dynamic" since this is kdump path,
		 * no need to worry about performance. ddw_list_new_entry() will
		 * set window->direct = false.
		 */
		window = ddw_list_new_entry(pdn, dma64);
		if (!window) {
			of_node_put(pdn);
			break;
		}

		spin_lock(&dma_win_list_lock);
		list_add(&window->list, &dma_win_list);
		spin_unlock(&dma_win_list_lock);
	}
}

static int find_existing_ddw_windows(void)
{
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
	find_existing_ddw_windows_named(DMA64_PROPNAME);

	return 0;
}
machine_arch_initcall(pseries, find_existing_ddw_windows);

/**
 * ddw_read_ext - Get the value of a DDW extension
 * @np: device node from which the extension value is to be read.
 * @extnum: index number of the extension.
 * @value: pointer to return value, modified when extension is available.
 *
 * Checks if "ibm,ddw-extensions" exists for this node, and gets the value
 * at index 'extnum'.
 * It can be used only to check if a property exists, passing value == NULL.
 *
 * Returns:
 *	0 if extension successfully read
 *	-EINVAL if the "ibm,ddw-extensions" does not exist,
 *	-ENODATA if "ibm,ddw-extensions" does not have a value, and
 *	-EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
 */
static inline int ddw_read_ext(const struct device_node *np, int extnum,
			       u32 *value)
{
	static const char propname[] = "ibm,ddw-extensions";
	u32 count;
	int ret;

	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
	if (ret)
		return ret;

	if (count < extnum)
		return -EOVERFLOW;

	if (!value)
		value = &count;

	return of_property_read_u32_index(np, propname, extnum, value);
}

static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		     struct ddw_query_response *query,
		     struct device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
	 * output parameters ibm,query-pe-dma-windows will have, ranging from
	 * 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
						 query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		      struct ddw_create_response *create, int page_shift,
		      int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		 "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		 "(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

static phys_addr_t ddw_memory_hotplug_max(void)
{
	resource_size_t max_addr;

#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
	max_addr = hot_add_drconf_memory_max();
#else
	max_addr = memblock_end_of_DRAM();
#endif

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/*
 * Platforms supporting placing the PHB in limited address mode, starting with
 * LoPAR level 2.13, implement this extension. In this mode, the DMA address
 * returned by DDW is above 4GB but less than 64 bits. This benefits IO
 * adapters that don't support 64-bit DMA addresses.
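 * (For example, an adapter limited to 48-bit DMA addressing could still use
 * a DDW placed below 1ULL << 48; the 48-bit figure is purely illustrative.)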
 */
static int limited_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win, las_supported;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		goto out;

	ret = ddw_read_ext(par_dn, DDW_EXT_LIMITED_ADDR_MODE, &las_supported);

	/* Limited Address Space extension available on the platform but DDW in
	 * limited addressing mode not supported
	 */
	if (!ret && !las_supported)
		ret = -EPROTO;

	if (ret) {
		dev_info(&dev->dev, "Limited Address Space for DDW not Supported, err: %d", ret);
		goto out;
	}

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 4, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid), 1);
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) for Limited Addr Support: %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);

out:
	return ret;
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K), __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M), __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0 ; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns true if can map all pages (direct mapping), false otherwise.
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn, u64 dma_mask)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr, dynamic_offset = 0;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool dynamic_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;
	bool limited_addr_req = false, limited_addr_enabled = false;
	int dev_max_ddw;
	int ddw_sz;

	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
		goto out_unlock;

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition. Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/* DMA Limited Addressing required? This is when the driver has
	 * requested to create DDW but supports mask which is less than 64-bits
	 */
	limited_addr_req = (dma_mask != DMA_BIT_MASK(64));

	/* place the PHB in Limited Addressing mode */
	if (limited_addr_req) {
		if (limited_dma_window(dev, pdn))
			goto out_failed;

		/* PHB is in Limited address mode */
		limited_addr_enabled = true;
	}

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win, true);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/* Maximum DMA window size that the device can address (in log2) */
	dev_max_ddw = fls64(dma_mask);

	/* If the device DMA mask is less than 64-bits, make sure the DMA window
	 * size is not bigger than what the device can access
	 */
	ddw_sz = min(order_base_2(query.largest_available_block << page_shift),
		     dev_max_ddw);

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (ddw_sz >= MAX_PHYSMEM_BITS)
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (ddw_sz < len) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = ddw_sz;
		dynamic_mapping = true;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));

		/* DDW is big enough to direct map RAM. If there is vPMEM, check
		 * if enough space is left in DDW where we can dynamically
		 * allocate TCEs for vPMEM. For now, this Hybrid sharing of DDW
		 * is only for SR-IOV devices.
		 */
		if (default_win_removed && pmem_present && !direct_mapping) {
			/* DDW is big enough to be split */
			if ((1ULL << ddw_sz) >=
			     MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) {

				direct_mapping = true;

				/* offset of the Dynamic part of DDW */
				dynamic_offset = 1ULL << max_ram_len;
			}

			/* DDW will at least have dynamic allocation */
			dynamic_mapping = true;

			/* create max size DDW possible */
			len = ddw_sz;
		}
	}

	/* Even if the DDW is split into both direct mapped RAM and dynamically
	 * mapped vPMEM, the DDW property in OF will be marked as Direct.
	 */
	win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	window->direct = direct_mapping;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set */
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
		if (default_win_removed) {
			iommu_tce_table_put(pci->table_group->tables[0]);
			pci->table_group->tables[0] = NULL;
			set_iommu_table_base(&dev->dev, NULL);
		}
	}

	if (dynamic_mapping) {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;
		u64 dynamic_addr, dynamic_len;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		/* If the DDW is split between directly mapped RAM and Dynamic
		 * mapped for TCES, offset into the DDW where the dynamic part
		 * begins.
		 */
		dynamic_addr = win_addr + dynamic_offset;
		dynamic_len = (1UL << len) - dynamic_offset;
		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,
					    dynamic_addr, dynamic_len, page_shift, NULL,
					    &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node,
				 start >> page_shift, end >> page_shift);

		pci->table_group->tables[default_win_removed ? 0 : 1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		/* default_win is valid here because default_win_removed == true */
		if (!of_find_property(pdn, "ibm,dma-window-saved", NULL))
			copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved");
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed || limited_addr_enabled)
		reset_dma_window(dev, pdn);

	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/* If we have persistent memory and the window size is not big enough
	 * to directly map both RAM and vPMEM, then we need to set DMA limit.
	 */
	if (pmem_present && direct_mapping && len != MAX_PHYSMEM_BITS)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset +
			(1ULL << max_ram_len);

	dev_info(&dev->dev, "lsa_required: %x, lsa_enabled: %x, direct mapping: %x\n",
		 limited_addr_req, limited_addr_enabled, direct_mapping);

	return direct_mapping;
}

static __u64 query_page_size_to_mask(u32 query_page_size)
{
	const long shift[] = {
		(SZ_4K), (SZ_64K), (SZ_16M),
		(SZ_32M), (SZ_64M), (SZ_128M),
		(SZ_256M), (SZ_16G), (SZ_2M)
	};
	int i, ret = 0;

	for (i = 0; i < ARRAY_SIZE(shift); i++) {
		if (query_page_size & (1 << i))
			ret |= shift[i];
	}

	return ret;
}

static void spapr_tce_init_table_group(struct pci_dev *pdev,
				       struct device_node *pdn,
				       struct dynamic_dma_window_prop prop)
{
	struct iommu_table_group *table_group = PCI_DN(pdn)->table_group;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];

	struct ddw_query_response query;
	int ret;

	/* Only for normal boot with default window. Doesn't matter during
	 * kdump, since these will not be used during kdump.
	 */
	if (is_kdump_kernel())
		return;

	if (table_group->max_dynamic_windows_supported != 0)
		return; /* already initialized */

	table_group->tce32_start = be64_to_cpu(prop.dma_base);
	table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);

	if (!of_find_property(pdn, "ibm,dma-window", NULL))
		dev_err(&pdev->dev, "default dma window missing!\n");

	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret) {
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	ret = query_ddw(pdev, ddw_avail, &query, pdn);
	if (ret) {
		dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__);
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	if (query.windows_available == 0)
		table_group->max_dynamic_windows_supported = 1;
	else
		table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES;

	table_group->max_levels = 1;
	table_group->pgsizes |= query_page_size_to_mask(query.page_size);
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	struct pci_dn *pci;
	struct dynamic_dma_window_prop prop;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug(" node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &prop);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug(" parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];

		iommu_table_setparms_common(tbl, pci->phb->bus->number,
				be32_to_cpu(prop.liobn),
				be64_to_cpu(prop.dma_base),
				1ULL << be32_to_cpu(prop.window_shift),
				be32_to_cpu(prop.tce_shift), NULL,
				&iommu_table_lpar_multi_ops);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				pci_domain_nr(pci->phb->bus), 0);
		pr_debug(" created table: %p\n", pci->table_group);
	} else {
		pr_debug(" found DMA window, table: %p\n", pci->table_group);
	}

	spapr_tce_init_table_group(dev, pdn, prop);

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* For DDW, the DMA mask should be more than 32 bits. For a mask more
	 * than 32 bits but less than 64 bits, DMA addressing is supported in
	 * Limited Addressing mode.
	 */
	if (dma_mask <= DMA_BIT_MASK(32))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus.
So we need to 1919 * search upwards in the tree until we either hit a dma-window 1920 * property, OR find a parent with a table already allocated. 1921 */ 1922 pdn = pci_dma_find(dn, NULL); 1923 if (pdn && PCI_DN(pdn)) 1924 return enable_ddw(pdev, pdn, dma_mask); 1925 1926 return false; 1927 } 1928 1929 #ifdef CONFIG_IOMMU_API 1930 /* 1931 * A simple iommu_table_group_ops which only allows reusing the existing 1932 * iommu_table. This handles VFIO for POWER7 or the nested KVM. 1933 * The ops does not allow creating windows and only allows reusing the existing 1934 * one if it matches table_group->tce32_start/tce32_size/page_shift. 1935 */ 1936 static unsigned long spapr_tce_get_table_size(__u32 page_shift, 1937 __u64 window_size, __u32 levels) 1938 { 1939 unsigned long size; 1940 1941 if (levels > 1) 1942 return ~0U; 1943 size = window_size >> (page_shift - 3); 1944 return size; 1945 } 1946 1947 static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group) 1948 { 1949 struct pci_dev *pdev = NULL; 1950 int ret; 1951 1952 /* No IOMMU group ? */ 1953 if (!group) 1954 return NULL; 1955 1956 ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table); 1957 if (!ret || !pdev) 1958 return NULL; 1959 return pdev; 1960 } 1961 1962 static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn) 1963 { 1964 reset_dma_window(pdev, pdn); 1965 copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window"); 1966 } 1967 1968 static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn) 1969 { 1970 struct pci_dn *pci = PCI_DN(pdn); 1971 struct dma_win *window; 1972 bool direct_mapping; 1973 int len; 1974 1975 if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) { 1976 remove_dma_window_named(pdn, true, direct_mapping ? 1977 DIRECT64_PROPNAME : DMA64_PROPNAME, true); 1978 if (!direct_mapping) { 1979 WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]); 1980 1981 if (pci->table_group->tables[1]) { 1982 iommu_tce_table_put(pci->table_group->tables[1]); 1983 pci->table_group->tables[1] = NULL; 1984 } else if (pci->table_group->tables[0]) { 1985 /* Default window was removed and only the DDW exists */ 1986 iommu_tce_table_put(pci->table_group->tables[0]); 1987 pci->table_group->tables[0] = NULL; 1988 } 1989 } 1990 spin_lock(&dma_win_list_lock); 1991 list_for_each_entry(window, &dma_win_list, list) { 1992 if (window->device == pdn) { 1993 list_del(&window->list); 1994 kfree(window); 1995 break; 1996 } 1997 } 1998 spin_unlock(&dma_win_list_lock); 1999 } 2000 2001 return 0; 2002 } 2003 2004 static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group, 2005 struct device *dev) 2006 { 2007 struct pci_dev *pdev = to_pci_dev(dev); 2008 const __be32 *default_prop; 2009 long liobn, offset, size; 2010 struct device_node *pdn; 2011 struct iommu_table *tbl; 2012 struct pci_dn *pci; 2013 2014 pdn = pci_dma_find_parent_node(pdev, table_group); 2015 if (!pdn || !PCI_DN(pdn)) { 2016 dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn); 2017 return -1; 2018 } 2019 pci = PCI_DN(pdn); 2020 2021 /* The default window is restored if not present already on removal of DDW. 2022 * However, if used by VFIO SPAPR sub driver, the user's order of removal of 2023 * windows might have been different to not leading to auto restoration, 2024 * suppose the DDW was removed first followed by the default one. 2025 * So, restore the default window with reset-pe-dma call explicitly. 
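 * restore_default_dma_window() below issues the reset-pe-dma call and
 * copies "ibm,dma-window-saved" back to "ibm,dma-window" so the default
 * window property can be re-read and parsed here.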
2026 */ 2027 restore_default_dma_window(pdev, pdn); 2028 2029 default_prop = of_get_property(pdn, "ibm,dma-window", NULL); 2030 of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); 2031 tbl = iommu_pseries_alloc_table(pci->phb->node); 2032 if (!tbl) { 2033 dev_err(&pdev->dev, "couldn't create new IOMMU table\n"); 2034 return -1; 2035 } 2036 2037 iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset, 2038 size, IOMMU_PAGE_SHIFT_4K, NULL, 2039 &iommu_table_lpar_multi_ops); 2040 iommu_init_table(tbl, pci->phb->node, 0, 0); 2041 2042 pci->table_group->tables[0] = tbl; 2043 set_iommu_table_base(&pdev->dev, tbl); 2044 2045 return 0; 2046 } 2047 2048 static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift, 2049 __u64 window_size) 2050 { 2051 if ((window_size <= table_group->tce32_size) && 2052 (page_shift == IOMMU_PAGE_SHIFT_4K)) 2053 return true; 2054 2055 return false; 2056 } 2057 2058 static long spapr_tce_create_table(struct iommu_table_group *table_group, int num, 2059 __u32 page_shift, __u64 window_size, __u32 levels, 2060 struct iommu_table **ptbl) 2061 { 2062 struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); 2063 u32 ddw_avail[DDW_APPLICABLE_SIZE]; 2064 struct ddw_create_response create; 2065 unsigned long liobn, offset, size; 2066 unsigned long start = 0, end = 0; 2067 struct ddw_query_response query; 2068 const __be32 *default_prop; 2069 struct failed_ddw_pdn *fpdn; 2070 unsigned int window_shift; 2071 struct device_node *pdn; 2072 struct iommu_table *tbl; 2073 struct dma_win *window; 2074 struct property *win64; 2075 struct pci_dn *pci; 2076 u64 win_addr; 2077 int len, i; 2078 long ret; 2079 2080 if (!is_power_of_2(window_size) || levels > 1) 2081 return -EINVAL; 2082 2083 window_shift = order_base_2(window_size); 2084 2085 mutex_lock(&dma_win_init_mutex); 2086 2087 ret = -ENODEV; 2088 2089 pdn = pci_dma_find_parent_node(pdev, table_group); 2090 if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ 2091 dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); 2092 goto out_failed; 2093 } 2094 pci = PCI_DN(pdn); 2095 2096 /* If the enable DDW failed for the pdn, dont retry! */ 2097 list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { 2098 if (fpdn->pdn == pdn) { 2099 dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn); 2100 goto out_unlock; 2101 } 2102 } 2103 2104 tbl = iommu_pseries_alloc_table(pci->phb->node); 2105 if (!tbl) { 2106 dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n"); 2107 goto out_unlock; 2108 } 2109 2110 if (num == 0) { 2111 bool direct_mapping; 2112 /* The request is not for default window? Ensure there is no DDW window already */ 2113 if (!is_default_window_request(table_group, page_shift, window_size)) { 2114 if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, 2115 &direct_mapping)) { 2116 dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn); 2117 ret = -EPERM; 2118 goto out_unlock; 2119 } 2120 } else { 2121 /* Request is for Default window, ensure there is no DDW if there is a 2122 * need to reset. reset-pe otherwise removes the DDW also 2123 */ 2124 default_prop = of_get_property(pdn, "ibm,dma-window", NULL); 2125 if (!default_prop) { 2126 if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, 2127 &direct_mapping)) { 2128 dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. 
Preventing the attempt as that would destroy the 64-bit window", 2129 pdn); 2130 ret = -EPERM; 2131 goto out_unlock; 2132 } 2133 2134 restore_default_dma_window(pdev, pdn); 2135 2136 default_prop = of_get_property(pdn, "ibm,dma-window", NULL); 2137 of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size); 2138 /* Limit the default window size to window_size */ 2139 iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, 2140 offset, 1UL << window_shift, 2141 IOMMU_PAGE_SHIFT_4K, NULL, 2142 &iommu_table_lpar_multi_ops); 2143 iommu_init_table(tbl, pci->phb->node, 2144 start >> IOMMU_PAGE_SHIFT_4K, 2145 end >> IOMMU_PAGE_SHIFT_4K); 2146 2147 table_group->tables[0] = tbl; 2148 2149 mutex_unlock(&dma_win_init_mutex); 2150 2151 goto exit; 2152 } 2153 } 2154 } 2155 2156 ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", 2157 &ddw_avail[0], DDW_APPLICABLE_SIZE); 2158 if (ret) { 2159 dev_info(&pdev->dev, "ibm,ddw-applicable not found\n"); 2160 goto out_failed; 2161 } 2162 ret = -ENODEV; 2163 2164 pr_err("%s: Calling query %pOF\n", __func__, pdn); 2165 ret = query_ddw(pdev, ddw_avail, &query, pdn); 2166 if (ret) 2167 goto out_failed; 2168 ret = -ENODEV; 2169 2170 len = window_shift; 2171 if (query.largest_available_block < (1ULL << (len - page_shift))) { 2172 dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n", 2173 1ULL << len, query.largest_available_block, 2174 1ULL << page_shift); 2175 ret = -EINVAL; /* Retry with smaller window size */ 2176 goto out_unlock; 2177 } 2178 2179 if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) { 2180 pr_err("%s: Create ddw failed %pOF\n", __func__, pdn); 2181 goto out_failed; 2182 } 2183 2184 win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; 2185 win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len); 2186 if (!win64) 2187 goto remove_window; 2188 2189 ret = of_add_property(pdn, win64); 2190 if (ret) { 2191 dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret); 2192 goto free_property; 2193 } 2194 ret = -ENODEV; 2195 2196 window = ddw_list_new_entry(pdn, win64->value); 2197 if (!window) 2198 goto remove_property; 2199 2200 window->direct = false; 2201 2202 for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { 2203 const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; 2204 2205 /* Look for MMIO32 */ 2206 if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) { 2207 start = pci->phb->mem_resources[i].start; 2208 end = pci->phb->mem_resources[i].end; 2209 break; 2210 } 2211 } 2212 2213 /* New table for using DDW instead of the default DMA window */ 2214 iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr, 2215 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); 2216 iommu_init_table(tbl, pci->phb->node, start >> page_shift, end >> page_shift); 2217 2218 pci->table_group->tables[num] = tbl; 2219 set_iommu_table_base(&pdev->dev, tbl); 2220 pdev->dev.archdata.dma_offset = win_addr; 2221 2222 spin_lock(&dma_win_list_lock); 2223 list_add(&window->list, &dma_win_list); 2224 spin_unlock(&dma_win_list_lock); 2225 2226 mutex_unlock(&dma_win_init_mutex); 2227 2228 goto exit; 2229 2230 remove_property: 2231 of_remove_property(pdn, win64); 2232 free_property: 2233 kfree(win64->name); 2234 kfree(win64->value); 2235 kfree(win64); 2236 remove_window: 2237 __remove_dma_window(pdn, ddw_avail, create.liobn); 2238 2239 out_failed: 2240 fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); 2241 if (!fpdn) 2242 goto 
out_unlock; 2243 fpdn->pdn = pdn; 2244 list_add(&fpdn->list, &failed_ddw_pdn_list); 2245 2246 out_unlock: 2247 mutex_unlock(&dma_win_init_mutex); 2248 2249 return ret; 2250 exit: 2251 /* Allocate the userspace view */ 2252 pseries_tce_iommu_userspace_view_alloc(tbl); 2253 tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels); 2254 2255 *ptbl = iommu_tce_table_get(tbl); 2256 2257 return 0; 2258 } 2259 2260 static bool is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl) 2261 { 2262 if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) && 2263 (tbl->it_page_shift == IOMMU_PAGE_SHIFT_4K)) 2264 return true; 2265 2266 return false; 2267 } 2268 2269 static long spapr_tce_set_window(struct iommu_table_group *table_group, 2270 int num, struct iommu_table *tbl) 2271 { 2272 return tbl == table_group->tables[num] ? 0 : -EPERM; 2273 } 2274 2275 static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num) 2276 { 2277 struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group); 2278 struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; 2279 struct iommu_table *tbl = table_group->tables[num]; 2280 struct failed_ddw_pdn *fpdn; 2281 struct dma_win *window; 2282 const char *win_name; 2283 int ret = -ENODEV; 2284 2285 if (!tbl) /* The table was never created OR window was never opened */ 2286 return 0; 2287 2288 mutex_lock(&dma_win_init_mutex); 2289 2290 if ((num == 0) && is_default_window_table(table_group, tbl)) 2291 win_name = "ibm,dma-window"; 2292 else 2293 win_name = DMA64_PROPNAME; 2294 2295 pdn = pci_dma_find(dn, NULL); 2296 if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ 2297 dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); 2298 goto out_failed; 2299 } 2300 2301 /* Dont clear the TCEs, User should have done it */ 2302 if (remove_dma_window_named(pdn, true, win_name, false)) { 2303 pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn); 2304 goto out_failed; /* Could not remove it either! */ 2305 } 2306 2307 if (strcmp(win_name, DMA64_PROPNAME) == 0) { 2308 spin_lock(&dma_win_list_lock); 2309 list_for_each_entry(window, &dma_win_list, list) { 2310 if (window->device == pdn) { 2311 list_del(&window->list); 2312 kfree(window); 2313 break; 2314 } 2315 } 2316 spin_unlock(&dma_win_list_lock); 2317 } 2318 2319 iommu_tce_table_put(table_group->tables[num]); 2320 table_group->tables[num] = NULL; 2321 2322 ret = 0; 2323 2324 goto out_unlock; 2325 2326 out_failed: 2327 fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); 2328 if (!fpdn) 2329 goto out_unlock; 2330 fpdn->pdn = pdn; 2331 list_add(&fpdn->list, &failed_ddw_pdn_list); 2332 2333 out_unlock: 2334 mutex_unlock(&dma_win_init_mutex); 2335 2336 return ret; 2337 } 2338 2339 static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev) 2340 { 2341 struct iommu_table *tbl = table_group->tables[0]; 2342 struct pci_dev *pdev = to_pci_dev(dev); 2343 struct device_node *dn = pci_device_to_OF_node(pdev); 2344 struct device_node *pdn; 2345 2346 /* SRIOV VFs using direct map by the host driver OR multifunction devices 2347 * where the ownership was taken on the attempt by the first function 2348 */ 2349 if (!tbl && (table_group->max_dynamic_windows_supported != 1)) 2350 return 0; 2351 2352 mutex_lock(&dma_win_init_mutex); 2353 2354 pdn = pci_dma_find(dn, NULL); 2355 if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! 
*/ 2356 dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); 2357 mutex_unlock(&dma_win_init_mutex); 2358 return -1; 2359 } 2360 2361 /* 2362 * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table 2363 * if there are any. In case of direct map, the entries will be left over, which 2364 * is fine for PEs with 2 DMA windows where the second window is created with create-pe 2365 * at which point the table is cleared. However, on VFs having only one DMA window, the 2366 * default window would end up seeing the entries left over from the direct map done 2367 * on the second window. So, remove the ddw explicitly so that clean_dma_window() 2368 * cleans up the entries if any. 2369 */ 2370 if (remove_dynamic_dma_windows(pdev, pdn)) { 2371 dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn); 2372 mutex_unlock(&dma_win_init_mutex); 2373 return -1; 2374 } 2375 2376 /* The table_group->tables[0] is not null now, it must be the default window 2377 * Remove it, let the userspace create it as it needs. 2378 */ 2379 if (table_group->tables[0]) { 2380 remove_dma_window_named(pdn, true, "ibm,dma-window", true); 2381 iommu_tce_table_put(tbl); 2382 table_group->tables[0] = NULL; 2383 } 2384 set_iommu_table_base(dev, NULL); 2385 2386 mutex_unlock(&dma_win_init_mutex); 2387 2388 return 0; 2389 } 2390 2391 static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev) 2392 { 2393 struct iommu_table *tbl = table_group->tables[0]; 2394 2395 if (tbl) { /* Default window already restored */ 2396 return; 2397 } 2398 2399 mutex_lock(&dma_win_init_mutex); 2400 2401 /* Restore the default window */ 2402 pseries_setup_default_iommu_config(table_group, dev); 2403 2404 mutex_unlock(&dma_win_init_mutex); 2405 2406 return; 2407 } 2408 2409 static struct iommu_table_group_ops spapr_tce_table_group_ops = { 2410 .get_table_size = spapr_tce_get_table_size, 2411 .create_table = spapr_tce_create_table, 2412 .set_window = spapr_tce_set_window, 2413 .unset_window = spapr_tce_unset_window, 2414 .take_ownership = spapr_tce_take_ownership, 2415 .release_ownership = spapr_tce_release_ownership, 2416 }; 2417 #endif 2418 2419 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, 2420 void *data) 2421 { 2422 struct dma_win *window; 2423 struct memory_notify *arg = data; 2424 int ret = 0; 2425 2426 /* This notifier can get called when onlining persistent memory as well. 2427 * TCEs are not pre-mapped for persistent memory. 
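 * Only direct-mapped windows are updated here, and only for ranges below
 * ddw_memory_hotplug_max().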
Persistent memory will 2428 * always be above ddw_memory_hotplug_max() 2429 */ 2430 2431 switch (action) { 2432 case MEM_GOING_ONLINE: 2433 spin_lock(&dma_win_list_lock); 2434 list_for_each_entry(window, &dma_win_list, list) { 2435 if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2436 ddw_memory_hotplug_max()) { 2437 ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, 2438 arg->nr_pages, window->prop); 2439 } 2440 /* XXX log error */ 2441 } 2442 spin_unlock(&dma_win_list_lock); 2443 break; 2444 case MEM_CANCEL_ONLINE: 2445 case MEM_OFFLINE: 2446 spin_lock(&dma_win_list_lock); 2447 list_for_each_entry(window, &dma_win_list, list) { 2448 if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2449 ddw_memory_hotplug_max()) { 2450 ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, 2451 arg->nr_pages, window->prop); 2452 } 2453 /* XXX log error */ 2454 } 2455 spin_unlock(&dma_win_list_lock); 2456 break; 2457 default: 2458 break; 2459 } 2460 if (ret && action != MEM_CANCEL_ONLINE) 2461 return NOTIFY_BAD; 2462 2463 return NOTIFY_OK; 2464 } 2465 2466 static struct notifier_block iommu_mem_nb = { 2467 .notifier_call = iommu_mem_notifier, 2468 }; 2469 2470 static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data) 2471 { 2472 int err = NOTIFY_OK; 2473 struct of_reconfig_data *rd = data; 2474 struct device_node *np = rd->dn; 2475 struct pci_dn *pci = PCI_DN(np); 2476 struct dma_win *window; 2477 2478 switch (action) { 2479 case OF_RECONFIG_DETACH_NODE: 2480 /* 2481 * Removing the property will invoke the reconfig 2482 * notifier again, which causes dead-lock on the 2483 * read-write semaphore of the notifier chain. So 2484 * we have to remove the property when releasing 2485 * the device node. 2486 */ 2487 if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true)) 2488 remove_dma_window_named(np, false, DMA64_PROPNAME, true); 2489 2490 if (pci && pci->table_group) 2491 iommu_pseries_free_group(pci->table_group, 2492 np->full_name); 2493 2494 spin_lock(&dma_win_list_lock); 2495 list_for_each_entry(window, &dma_win_list, list) { 2496 if (window->device == np) { 2497 list_del(&window->list); 2498 kfree(window); 2499 break; 2500 } 2501 } 2502 spin_unlock(&dma_win_list_lock); 2503 break; 2504 default: 2505 err = NOTIFY_DONE; 2506 break; 2507 } 2508 return err; 2509 } 2510 2511 static struct notifier_block iommu_reconfig_nb = { 2512 .notifier_call = iommu_reconfig_notifier, 2513 }; 2514 2515 /* These are called very early. 
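 * iommu_init_early_pSeries() installs the LPAR or bare-metal
 * dma_bus_setup/dma_dev_setup hooks (and, unless DDW is disabled, the
 * iommu_bypass_supported hook) on pseries_pci_controller_ops, registers
 * the reconfig and memory hotplug notifiers, and selects dma_iommu_ops
 * as the PCI DMA ops.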
*/ 2516 void __init iommu_init_early_pSeries(void) 2517 { 2518 if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) 2519 return; 2520 2521 if (firmware_has_feature(FW_FEATURE_LPAR)) { 2522 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; 2523 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; 2524 if (!disable_ddw) 2525 pseries_pci_controller_ops.iommu_bypass_supported = 2526 iommu_bypass_supported_pSeriesLP; 2527 } else { 2528 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; 2529 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; 2530 } 2531 2532 2533 of_reconfig_notifier_register(&iommu_reconfig_nb); 2534 register_memory_notifier(&iommu_mem_nb); 2535 2536 set_pci_dma_ops(&dma_iommu_ops); 2537 } 2538 2539 static int __init disable_multitce(char *str) 2540 { 2541 if (strcmp(str, "off") == 0 && 2542 firmware_has_feature(FW_FEATURE_LPAR) && 2543 (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || 2544 firmware_has_feature(FW_FEATURE_STUFF_TCE))) { 2545 printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); 2546 powerpc_firmware_features &= 2547 ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); 2548 } 2549 return 1; 2550 } 2551 2552 __setup("multitce=", disable_multitce); 2553 2554 #ifdef CONFIG_SPAPR_TCE_IOMMU 2555 struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, 2556 struct pci_dev *pdev) 2557 { 2558 struct device_node *pdn, *dn = pdev->dev.of_node; 2559 struct iommu_group *grp; 2560 struct pci_dn *pci; 2561 2562 pdn = pci_dma_find(dn, NULL); 2563 if (!pdn || !PCI_DN(pdn)) 2564 return ERR_PTR(-ENODEV); 2565 2566 pci = PCI_DN(pdn); 2567 if (!pci->table_group) 2568 return ERR_PTR(-ENODEV); 2569 2570 grp = pci->table_group->group; 2571 if (!grp) 2572 return ERR_PTR(-ENODEV); 2573 2574 return iommu_group_ref_get(grp); 2575 } 2576 #endif 2577
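/*
 * Illustrative only (an assumption about the caller, not code in this
 * file): the IOMMU_API ops in spapr_tce_table_group_ops are normally
 * driven by the generic SPAPR TCE layer (e.g. VFIO), roughly in this
 * order:
 *
 *	table_group->ops->take_ownership(table_group, dev);
 *	table_group->ops->create_table(table_group, 0, page_shift,
 *				       window_size, 1, &tbl);
 *	table_group->ops->set_window(table_group, 0, tbl);
 *	...userspace maps pages through the returned table...
 *	table_group->ops->unset_window(table_group, 0);
 *	table_group->ops->release_ownership(table_group, dev);
 */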