1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation 4 * 5 * Rewrite, cleanup: 6 * 7 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation 8 * Copyright (C) 2006 Olof Johansson <olof@lixom.net> 9 * 10 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR. 11 */ 12 13 #include <linux/init.h> 14 #include <linux/types.h> 15 #include <linux/slab.h> 16 #include <linux/mm.h> 17 #include <linux/memblock.h> 18 #include <linux/spinlock.h> 19 #include <linux/string.h> 20 #include <linux/pci.h> 21 #include <linux/dma-mapping.h> 22 #include <linux/crash_dump.h> 23 #include <linux/memory.h> 24 #include <linux/vmalloc.h> 25 #include <linux/of.h> 26 #include <linux/of_address.h> 27 #include <linux/iommu.h> 28 #include <linux/rculist.h> 29 #include <asm/io.h> 30 #include <asm/prom.h> 31 #include <asm/rtas.h> 32 #include <asm/iommu.h> 33 #include <asm/pci-bridge.h> 34 #include <asm/machdep.h> 35 #include <asm/firmware.h> 36 #include <asm/tce.h> 37 #include <asm/ppc-pci.h> 38 #include <asm/udbg.h> 39 #include <asm/mmzone.h> 40 #include <asm/plpar_wrappers.h> 41 42 #include "pseries.h" 43 44 enum { 45 DDW_QUERY_PE_DMA_WIN = 0, 46 DDW_CREATE_PE_DMA_WIN = 1, 47 DDW_REMOVE_PE_DMA_WIN = 2, 48 49 DDW_APPLICABLE_SIZE 50 }; 51 52 enum { 53 DDW_EXT_SIZE = 0, 54 DDW_EXT_RESET_DMA_WIN = 1, 55 DDW_EXT_QUERY_OUT_SIZE = 2, 56 DDW_EXT_LIMITED_ADDR_MODE = 3 57 }; 58 59 static struct iommu_table *iommu_pseries_alloc_table(int node) 60 { 61 struct iommu_table *tbl; 62 63 tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); 64 if (!tbl) 65 return NULL; 66 67 INIT_LIST_HEAD_RCU(&tbl->it_group_list); 68 kref_init(&tbl->it_kref); 69 return tbl; 70 } 71 72 #ifdef CONFIG_IOMMU_API 73 static struct iommu_table_group_ops spapr_tce_table_group_ops; 74 #endif 75 76 static struct iommu_table_group *iommu_pseries_alloc_group(int node) 77 { 78 struct iommu_table_group *table_group; 79 80 
table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); 81 if (!table_group) 82 return NULL; 83 84 #ifdef CONFIG_IOMMU_API 85 table_group->ops = &spapr_tce_table_group_ops; 86 table_group->pgsizes = SZ_4K; 87 #endif 88 89 table_group->tables[0] = iommu_pseries_alloc_table(node); 90 if (table_group->tables[0]) 91 return table_group; 92 93 kfree(table_group); 94 return NULL; 95 } 96 97 static void iommu_pseries_free_group(struct iommu_table_group *table_group, 98 const char *node_name) 99 { 100 if (!table_group) 101 return; 102 103 #ifdef CONFIG_IOMMU_API 104 if (table_group->group) { 105 iommu_group_put(table_group->group); 106 BUG_ON(table_group->group); 107 } 108 #endif 109 110 /* Default DMA window table is at index 0, while DDW at 1. SR-IOV 111 * adapters only have table on index 0(if not direct mapped). 112 */ 113 if (table_group->tables[0]) 114 iommu_tce_table_put(table_group->tables[0]); 115 116 if (table_group->tables[1]) 117 iommu_tce_table_put(table_group->tables[1]); 118 119 kfree(table_group); 120 } 121 122 static int tce_build_pSeries(struct iommu_table *tbl, long index, 123 long npages, unsigned long uaddr, 124 enum dma_data_direction direction, 125 unsigned long attrs) 126 { 127 u64 proto_tce; 128 __be64 *tcep; 129 u64 rpn; 130 const unsigned long tceshift = tbl->it_page_shift; 131 const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); 132 133 proto_tce = TCE_PCI_READ; // Read allowed 134 135 if (direction != DMA_TO_DEVICE) 136 proto_tce |= TCE_PCI_WRITE; 137 138 tcep = ((__be64 *)tbl->it_base) + index; 139 140 while (npages--) { 141 /* can't move this out since we might cross MEMBLOCK boundary */ 142 rpn = __pa(uaddr) >> tceshift; 143 *tcep = cpu_to_be64(proto_tce | rpn << tceshift); 144 145 uaddr += pagesize; 146 tcep++; 147 } 148 return 0; 149 } 150 151 152 static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages) 153 { 154 __be64 *tcep; 155 156 tcep = ((__be64 *)tbl->it_base) + index; 157 158 while (npages--) 159 
*(tcep++) = 0; 160 } 161 162 static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) 163 { 164 __be64 *tcep; 165 166 tcep = ((__be64 *)tbl->it_base) + index; 167 168 return be64_to_cpu(*tcep); 169 } 170 171 #ifdef CONFIG_IOMMU_API 172 static long pseries_tce_iommu_userspace_view_alloc(struct iommu_table *tbl) 173 { 174 unsigned long cb = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE); 175 unsigned long *uas; 176 177 if (tbl->it_indirect_levels) /* Impossible */ 178 return -EPERM; 179 180 WARN_ON(tbl->it_userspace); 181 182 uas = vzalloc(cb); 183 if (!uas) 184 return -ENOMEM; 185 186 tbl->it_userspace = (__be64 *) uas; 187 188 return 0; 189 } 190 #endif 191 192 static void tce_iommu_userspace_view_free(struct iommu_table *tbl) 193 { 194 vfree(tbl->it_userspace); 195 tbl->it_userspace = NULL; 196 } 197 198 static void tce_free_pSeries(struct iommu_table *tbl) 199 { 200 if (tbl->it_userspace) 201 tce_iommu_userspace_view_free(tbl); 202 } 203 204 static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); 205 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); 206 207 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, 208 long npages, unsigned long uaddr, 209 enum dma_data_direction direction, 210 unsigned long attrs) 211 { 212 u64 rc = 0; 213 u64 proto_tce, tce; 214 u64 rpn; 215 int ret = 0; 216 long tcenum_start = tcenum, npages_start = npages; 217 218 rpn = __pa(uaddr) >> tceshift; 219 proto_tce = TCE_PCI_READ; 220 if (direction != DMA_TO_DEVICE) 221 proto_tce |= TCE_PCI_WRITE; 222 223 while (npages--) { 224 tce = proto_tce | rpn << tceshift; 225 rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); 226 227 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { 228 ret = (int)rc; 229 tce_free_pSeriesLP(liobn, tcenum_start, tceshift, 230 (npages_start - (npages + 1))); 231 break; 232 } 233 234 if (rc && printk_ratelimit()) { 235 printk("tce_build_pSeriesLP: plpar_tce_put failed. 
rc=%lld\n", rc); 236 printk("\tindex = 0x%llx\n", (u64)liobn); 237 printk("\ttcenum = 0x%llx\n", (u64)tcenum); 238 printk("\ttce val = 0x%llx\n", tce ); 239 dump_stack(); 240 } 241 242 tcenum++; 243 rpn++; 244 } 245 return ret; 246 } 247 248 static DEFINE_PER_CPU(__be64 *, tce_page); 249 250 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, 251 long npages, unsigned long uaddr, 252 enum dma_data_direction direction, 253 unsigned long attrs) 254 { 255 u64 rc = 0; 256 u64 proto_tce; 257 __be64 *tcep; 258 u64 rpn; 259 long l, limit; 260 long tcenum_start = tcenum, npages_start = npages; 261 int ret = 0; 262 unsigned long flags; 263 const unsigned long tceshift = tbl->it_page_shift; 264 265 if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { 266 return tce_build_pSeriesLP(tbl->it_index, tcenum, 267 tceshift, npages, uaddr, 268 direction, attrs); 269 } 270 271 local_irq_save(flags); /* to protect tcep and the page behind it */ 272 273 tcep = __this_cpu_read(tce_page); 274 275 /* This is safe to do since interrupts are off when we're called 276 * from iommu_alloc{,_sg}() 277 */ 278 if (!tcep) { 279 tcep = (__be64 *)__get_free_page(GFP_ATOMIC); 280 /* If allocation fails, fall back to the loop implementation */ 281 if (!tcep) { 282 local_irq_restore(flags); 283 return tce_build_pSeriesLP(tbl->it_index, tcenum, 284 tceshift, 285 npages, uaddr, direction, attrs); 286 } 287 __this_cpu_write(tce_page, tcep); 288 } 289 290 rpn = __pa(uaddr) >> tceshift; 291 proto_tce = TCE_PCI_READ; 292 if (direction != DMA_TO_DEVICE) 293 proto_tce |= TCE_PCI_WRITE; 294 295 /* We can map max one pageful of TCEs at a time */ 296 do { 297 /* 298 * Set up the page with TCE data, looping through and setting 299 * the values. 
300 */ 301 limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE); 302 303 for (l = 0; l < limit; l++) { 304 tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); 305 rpn++; 306 } 307 308 rc = plpar_tce_put_indirect((u64)tbl->it_index, 309 (u64)tcenum << tceshift, 310 (u64)__pa(tcep), 311 limit); 312 313 npages -= limit; 314 tcenum += limit; 315 } while (npages > 0 && !rc); 316 317 local_irq_restore(flags); 318 319 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { 320 ret = (int)rc; 321 tce_freemulti_pSeriesLP(tbl, tcenum_start, 322 (npages_start - (npages + limit))); 323 return ret; 324 } 325 326 if (rc && printk_ratelimit()) { 327 printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); 328 printk("\tindex = 0x%llx\n", (u64)tbl->it_index); 329 printk("\tnpages = 0x%llx\n", (u64)npages); 330 printk("\ttce[0] val = 0x%llx\n", tcep[0]); 331 dump_stack(); 332 } 333 return ret; 334 } 335 336 static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, 337 long npages) 338 { 339 u64 rc; 340 341 while (npages--) { 342 rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); 343 344 if (rc && printk_ratelimit()) { 345 printk("tce_free_pSeriesLP: plpar_tce_put failed. 
rc=%lld\n", rc); 346 printk("\tindex = 0x%llx\n", (u64)liobn); 347 printk("\ttcenum = 0x%llx\n", (u64)tcenum); 348 dump_stack(); 349 } 350 351 tcenum++; 352 } 353 } 354 355 356 static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) 357 { 358 u64 rc; 359 long rpages = npages; 360 unsigned long limit; 361 362 if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) 363 return tce_free_pSeriesLP(tbl->it_index, tcenum, 364 tbl->it_page_shift, npages); 365 366 do { 367 limit = min_t(unsigned long, rpages, 512); 368 369 rc = plpar_tce_stuff((u64)tbl->it_index, 370 (u64)tcenum << tbl->it_page_shift, 0, limit); 371 372 rpages -= limit; 373 tcenum += limit; 374 } while (rpages > 0 && !rc); 375 376 if (rc && printk_ratelimit()) { 377 printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); 378 printk("\trc = %lld\n", rc); 379 printk("\tindex = 0x%llx\n", (u64)tbl->it_index); 380 printk("\tnpages = 0x%llx\n", (u64)npages); 381 dump_stack(); 382 } 383 } 384 385 static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) 386 { 387 u64 rc; 388 unsigned long tce_ret; 389 390 rc = plpar_tce_get((u64)tbl->it_index, 391 (u64)tcenum << tbl->it_page_shift, &tce_ret); 392 393 if (rc && printk_ratelimit()) { 394 printk("tce_get_pSeriesLP: plpar_tce_get failed. 
rc=%lld\n", rc); 395 printk("\tindex = 0x%llx\n", (u64)tbl->it_index); 396 printk("\ttcenum = 0x%llx\n", (u64)tcenum); 397 dump_stack(); 398 } 399 400 return tce_ret; 401 } 402 403 /* this is compatible with cells for the device tree property */ 404 struct dynamic_dma_window_prop { 405 __be32 liobn; /* tce table number */ 406 __be64 dma_base; /* address hi,lo */ 407 __be32 tce_shift; /* ilog2(tce_page_size) */ 408 __be32 window_shift; /* ilog2(tce_window_size) */ 409 }; 410 411 struct dma_win { 412 struct device_node *device; 413 const struct dynamic_dma_window_prop *prop; 414 bool direct; 415 struct list_head list; 416 }; 417 418 /* Dynamic DMA Window support */ 419 struct ddw_query_response { 420 u32 windows_available; 421 u64 largest_available_block; 422 u32 page_size; 423 u32 migration_capable; 424 }; 425 426 struct ddw_create_response { 427 u32 liobn; 428 u32 addr_hi; 429 u32 addr_lo; 430 }; 431 432 static LIST_HEAD(dma_win_list); 433 /* prevents races between memory on/offline and window creation */ 434 static DEFINE_SPINLOCK(dma_win_list_lock); 435 /* protects initializing window twice for same device */ 436 static DEFINE_MUTEX(dma_win_init_mutex); 437 438 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, 439 unsigned long num_pfn, const void *arg) 440 { 441 const struct dynamic_dma_window_prop *maprange = arg; 442 int rc; 443 u64 tce_size, num_tce, dma_offset, next; 444 u32 tce_shift; 445 long limit; 446 447 tce_shift = be32_to_cpu(maprange->tce_shift); 448 tce_size = 1ULL << tce_shift; 449 next = start_pfn << PAGE_SHIFT; 450 num_tce = num_pfn << PAGE_SHIFT; 451 452 /* round back to the beginning of the tce page size */ 453 num_tce += next & (tce_size - 1); 454 next &= ~(tce_size - 1); 455 456 /* covert to number of tces */ 457 num_tce |= tce_size - 1; 458 num_tce >>= tce_shift; 459 460 do { 461 /* 462 * Set up the page with TCE data, looping through and setting 463 * the values. 
464 */ 465 limit = min_t(long, num_tce, 512); 466 dma_offset = next + be64_to_cpu(maprange->dma_base); 467 468 rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), 469 dma_offset, 470 0, limit); 471 next += limit * tce_size; 472 num_tce -= limit; 473 } while (num_tce > 0 && !rc); 474 475 return rc; 476 } 477 478 static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, 479 unsigned long num_pfn, const void *arg) 480 { 481 const struct dynamic_dma_window_prop *maprange = arg; 482 u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn; 483 __be64 *tcep; 484 u32 tce_shift; 485 u64 rc = 0; 486 long l, limit; 487 488 if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { 489 unsigned long tceshift = be32_to_cpu(maprange->tce_shift); 490 unsigned long dmastart = (start_pfn << PAGE_SHIFT) + 491 be64_to_cpu(maprange->dma_base); 492 unsigned long tcenum = dmastart >> tceshift; 493 unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift; 494 void *uaddr = __va(start_pfn << PAGE_SHIFT); 495 496 return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn), 497 tcenum, tceshift, npages, (unsigned long) uaddr, 498 DMA_BIDIRECTIONAL, 0); 499 } 500 501 local_irq_disable(); /* to protect tcep and the page behind it */ 502 tcep = __this_cpu_read(tce_page); 503 504 if (!tcep) { 505 tcep = (__be64 *)__get_free_page(GFP_ATOMIC); 506 if (!tcep) { 507 local_irq_enable(); 508 return -ENOMEM; 509 } 510 __this_cpu_write(tce_page, tcep); 511 } 512 513 proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; 514 515 liobn = (u64)be32_to_cpu(maprange->liobn); 516 tce_shift = be32_to_cpu(maprange->tce_shift); 517 tce_size = 1ULL << tce_shift; 518 next = start_pfn << PAGE_SHIFT; 519 num_tce = num_pfn << PAGE_SHIFT; 520 521 /* round back to the beginning of the tce page size */ 522 num_tce += next & (tce_size - 1); 523 next &= ~(tce_size - 1); 524 525 /* covert to number of tces */ 526 num_tce |= tce_size - 1; 527 num_tce >>= tce_shift; 528 529 /* We can map max one pageful of TCEs at a time */ 530 do { 
531 /* 532 * Set up the page with TCE data, looping through and setting 533 * the values. 534 */ 535 limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE); 536 dma_offset = next + be64_to_cpu(maprange->dma_base); 537 538 for (l = 0; l < limit; l++) { 539 tcep[l] = cpu_to_be64(proto_tce | next); 540 next += tce_size; 541 } 542 543 rc = plpar_tce_put_indirect(liobn, 544 dma_offset, 545 (u64)__pa(tcep), 546 limit); 547 548 num_tce -= limit; 549 } while (num_tce > 0 && !rc); 550 551 /* error cleanup: caller will clear whole range */ 552 553 local_irq_enable(); 554 return rc; 555 } 556 557 static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, 558 unsigned long num_pfn, void *arg) 559 { 560 return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); 561 } 562 563 static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, 564 unsigned long liobn, unsigned long win_addr, 565 unsigned long window_size, unsigned long page_shift, 566 void *base, struct iommu_table_ops *table_ops) 567 { 568 tbl->it_busno = busno; 569 tbl->it_index = liobn; 570 tbl->it_offset = win_addr >> page_shift; 571 tbl->it_size = window_size >> page_shift; 572 tbl->it_page_shift = page_shift; 573 tbl->it_base = (unsigned long)base; 574 tbl->it_blocksize = 16; 575 tbl->it_type = TCE_PCI; 576 tbl->it_ops = table_ops; 577 } 578 579 struct iommu_table_ops iommu_table_pseries_ops; 580 581 static void iommu_table_setparms(struct pci_controller *phb, 582 struct device_node *dn, 583 struct iommu_table *tbl) 584 { 585 struct device_node *node; 586 const unsigned long *basep; 587 const u32 *sizep; 588 589 /* Test if we are going over 2GB of DMA space */ 590 if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { 591 udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); 592 panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); 593 } 594 595 node = phb->dn; 596 basep = of_get_property(node, "linux,tce-base", NULL); 597 sizep = 
of_get_property(node, "linux,tce-size", NULL); 598 if (basep == NULL || sizep == NULL) { 599 printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has " 600 "missing tce entries !\n", dn); 601 return; 602 } 603 604 iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, 605 phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, 606 __va(*basep), &iommu_table_pseries_ops); 607 608 if (!is_kdump_kernel()) 609 memset((void *)tbl->it_base, 0, *sizep); 610 611 phb->dma_window_base_cur += phb->dma_window_size; 612 } 613 614 struct iommu_table_ops iommu_table_lpar_multi_ops; 615 616 struct iommu_table_ops iommu_table_pseries_ops = { 617 .set = tce_build_pSeries, 618 .clear = tce_clear_pSeries, 619 .get = tce_get_pseries 620 }; 621 622 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) 623 { 624 struct device_node *dn; 625 struct iommu_table *tbl; 626 struct device_node *isa_dn, *isa_dn_orig; 627 struct device_node *tmp; 628 struct pci_dn *pci; 629 int children; 630 631 dn = pci_bus_to_OF_node(bus); 632 633 pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn); 634 635 if (bus->self) { 636 /* This is not a root bus, any setup will be done for the 637 * device-side of the bridge in iommu_dev_setup_pSeries(). 638 */ 639 return; 640 } 641 pci = PCI_DN(dn); 642 643 /* Check if the ISA bus on the system is under 644 * this PHB. 645 */ 646 isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa"); 647 648 while (isa_dn && isa_dn != dn) 649 isa_dn = isa_dn->parent; 650 651 of_node_put(isa_dn_orig); 652 653 /* Count number of direct PCI children of the PHB. */ 654 for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling) 655 children++; 656 657 pr_debug("Children: %d\n", children); 658 659 /* Calculate amount of DMA window per slot. Each window must be 660 * a power of two (due to pci_alloc_consistent requirements). 661 * 662 * Keep 256MB aside for PHBs with ISA. 
663 */ 664 665 if (!isa_dn) { 666 /* No ISA/IDE - just set window size and return */ 667 pci->phb->dma_window_size = 0x80000000ul; /* To be divided */ 668 669 while (pci->phb->dma_window_size * children > 0x80000000ul) 670 pci->phb->dma_window_size >>= 1; 671 pr_debug("No ISA/IDE, window size is 0x%llx\n", 672 pci->phb->dma_window_size); 673 pci->phb->dma_window_base_cur = 0; 674 675 return; 676 } 677 678 /* If we have ISA, then we probably have an IDE 679 * controller too. Allocate a 128MB table but 680 * skip the first 128MB to avoid stepping on ISA 681 * space. 682 */ 683 pci->phb->dma_window_size = 0x8000000ul; 684 pci->phb->dma_window_base_cur = 0x8000000ul; 685 686 pci->table_group = iommu_pseries_alloc_group(pci->phb->node); 687 tbl = pci->table_group->tables[0]; 688 689 iommu_table_setparms(pci->phb, dn, tbl); 690 691 if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) 692 panic("Failed to initialize iommu table"); 693 694 /* Divide the rest (1.75GB) among the children */ 695 pci->phb->dma_window_size = 0x80000000ul; 696 while (pci->phb->dma_window_size * children > 0x70000000ul) 697 pci->phb->dma_window_size >>= 1; 698 699 pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); 700 } 701 702 #ifdef CONFIG_IOMMU_API 703 static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 704 long *tce, enum dma_data_direction *direction) 705 { 706 long rc; 707 unsigned long ioba = (unsigned long) index << tbl->it_page_shift; 708 unsigned long flags, oldtce = 0; 709 u64 proto_tce = iommu_direction_to_tce_perm(*direction); 710 unsigned long newtce = *tce | proto_tce; 711 712 spin_lock_irqsave(&tbl->large_pool.lock, flags); 713 714 rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce); 715 if (!rc) 716 rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce); 717 718 if (!rc) { 719 *direction = iommu_tce_direction(oldtce); 720 *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); 721 } 722 723 spin_unlock_irqrestore(&tbl->large_pool.lock, 
flags); 724 725 return rc; 726 } 727 728 static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index, 729 bool __always_unused alloc) 730 { 731 return tbl->it_userspace ? &tbl->it_userspace[index - tbl->it_offset] : NULL; 732 } 733 #endif 734 735 struct iommu_table_ops iommu_table_lpar_multi_ops = { 736 .set = tce_buildmulti_pSeriesLP, 737 #ifdef CONFIG_IOMMU_API 738 .xchg_no_kill = tce_exchange_pseries, 739 .useraddrptr = tce_useraddr_pSeriesLP, 740 #endif 741 .clear = tce_freemulti_pSeriesLP, 742 .get = tce_get_pSeriesLP, 743 .free = tce_free_pSeries 744 }; 745 746 #ifdef CONFIG_IOMMU_API 747 /* 748 * When the DMA window properties might have been removed, 749 * the parent node has the table_group setup on it. 750 */ 751 static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev, 752 struct iommu_table_group *table_group) 753 { 754 struct device_node *dn = pci_device_to_OF_node(dev); 755 struct pci_dn *rpdn; 756 757 for (; dn && PCI_DN(dn); dn = dn->parent) { 758 rpdn = PCI_DN(dn); 759 760 if (table_group == rpdn->table_group) 761 return dn; 762 } 763 764 return NULL; 765 } 766 #endif 767 768 /* 769 * Find nearest ibm,dma-window (default DMA window) or direct DMA window or 770 * dynamic 64bit DMA window, walking up the device tree. 
771 */ 772 static struct device_node *pci_dma_find(struct device_node *dn, 773 struct dynamic_dma_window_prop *prop) 774 { 775 const __be32 *default_prop = NULL; 776 const __be32 *ddw_prop = NULL; 777 struct device_node *rdn = NULL; 778 bool default_win = false, ddw_win = false; 779 780 for ( ; dn && PCI_DN(dn); dn = dn->parent) { 781 default_prop = of_get_property(dn, "ibm,dma-window", NULL); 782 if (default_prop) { 783 rdn = dn; 784 default_win = true; 785 } 786 ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); 787 if (ddw_prop) { 788 rdn = dn; 789 ddw_win = true; 790 break; 791 } 792 ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); 793 if (ddw_prop) { 794 rdn = dn; 795 ddw_win = true; 796 break; 797 } 798 799 /* At least found default window, which is the case for normal boot */ 800 if (default_win) 801 break; 802 } 803 804 /* For PCI devices there will always be a DMA window, either on the device 805 * or parent bus 806 */ 807 WARN_ON(!(default_win | ddw_win)); 808 809 /* caller doesn't want to get DMA window property */ 810 if (!prop) 811 return rdn; 812 813 /* parse DMA window property. During normal system boot, only default 814 * DMA window is passed in OF. But, for kdump, a dedicated adapter might 815 * have both default and DDW in FDT. In this scenario, DDW takes precedence 816 * over default window. 
817 */ 818 if (ddw_win) { 819 struct dynamic_dma_window_prop *p; 820 821 p = (struct dynamic_dma_window_prop *)ddw_prop; 822 prop->liobn = p->liobn; 823 prop->dma_base = p->dma_base; 824 prop->tce_shift = p->tce_shift; 825 prop->window_shift = p->window_shift; 826 } else if (default_win) { 827 unsigned long offset, size, liobn; 828 829 of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); 830 831 prop->liobn = cpu_to_be32((u32)liobn); 832 prop->dma_base = cpu_to_be64(offset); 833 prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); 834 prop->window_shift = cpu_to_be32(order_base_2(size)); 835 } 836 837 return rdn; 838 } 839 840 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) 841 { 842 struct iommu_table *tbl; 843 struct device_node *dn, *pdn; 844 struct pci_dn *ppci; 845 struct dynamic_dma_window_prop prop; 846 847 dn = pci_bus_to_OF_node(bus); 848 849 pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", 850 dn); 851 852 pdn = pci_dma_find(dn, &prop); 853 854 /* In PPC architecture, there will always be DMA window on bus or one of the 855 * parent bus. During reboot, there will be ibm,dma-window property to 856 * define DMA window. For kdump, there will at least be default window or DDW 857 * or both. 858 * There is an exception to the above. In case the PE goes into frozen 859 * state, firmware may not provide ibm,dma-window property at the time 860 * of LPAR boot up. 
861 */ 862 863 if (!pdn) { 864 pr_debug(" no ibm,dma-window property !\n"); 865 return; 866 } 867 868 ppci = PCI_DN(pdn); 869 870 pr_debug(" parent is %pOF, iommu_table: 0x%p\n", 871 pdn, ppci->table_group); 872 873 if (!ppci->table_group) { 874 ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); 875 tbl = ppci->table_group->tables[0]; 876 877 iommu_table_setparms_common(tbl, ppci->phb->bus->number, 878 be32_to_cpu(prop.liobn), 879 be64_to_cpu(prop.dma_base), 880 1ULL << be32_to_cpu(prop.window_shift), 881 be32_to_cpu(prop.tce_shift), NULL, 882 &iommu_table_lpar_multi_ops); 883 884 if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) 885 panic("Failed to initialize iommu table"); 886 887 iommu_register_group(ppci->table_group, 888 pci_domain_nr(bus), 0); 889 pr_debug(" created table: %p\n", ppci->table_group); 890 } 891 } 892 893 894 static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) 895 { 896 struct device_node *dn; 897 struct iommu_table *tbl; 898 899 pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev)); 900 901 dn = dev->dev.of_node; 902 903 /* If we're the direct child of a root bus, then we need to allocate 904 * an iommu table ourselves. The bus setup code should have setup 905 * the window sizes already. 906 */ 907 if (!dev->bus->self) { 908 struct pci_controller *phb = PCI_DN(dn)->phb; 909 910 pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); 911 PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); 912 tbl = PCI_DN(dn)->table_group->tables[0]; 913 iommu_table_setparms(phb, dn, tbl); 914 915 if (!iommu_init_table(tbl, phb->node, 0, 0)) 916 panic("Failed to initialize iommu table"); 917 918 set_iommu_table_base(&dev->dev, tbl); 919 return; 920 } 921 922 /* If this device is further down the bus tree, search upwards until 923 * an already allocated iommu table is found and use that. 
924 */ 925 926 while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) 927 dn = dn->parent; 928 929 if (dn && PCI_DN(dn)) 930 set_iommu_table_base(&dev->dev, 931 PCI_DN(dn)->table_group->tables[0]); 932 else 933 printk(KERN_WARNING "iommu: Device %s has no iommu table\n", 934 pci_name(dev)); 935 } 936 937 static int __read_mostly disable_ddw; 938 939 static int __init disable_ddw_setup(char *str) 940 { 941 disable_ddw = 1; 942 printk(KERN_INFO "ppc iommu: disabling ddw.\n"); 943 944 return 0; 945 } 946 947 early_param("disable_ddw", disable_ddw_setup); 948 949 static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) 950 { 951 int ret; 952 953 ret = tce_clearrange_multi_pSeriesLP(0, 954 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); 955 if (ret) 956 pr_warn("%pOF failed to clear tces in window.\n", 957 np); 958 else 959 pr_debug("%pOF successfully cleared tces in window.\n", 960 np); 961 } 962 963 /* 964 * Call only if DMA window is clean. 
965 */ 966 static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) 967 { 968 int ret; 969 970 ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); 971 if (ret) 972 pr_warn("%pOF: failed to remove DMA window: rtas returned " 973 "%d to ibm,remove-pe-dma-window(%x) %llx\n", 974 np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); 975 else 976 pr_debug("%pOF: successfully removed DMA window: rtas returned " 977 "%d to ibm,remove-pe-dma-window(%x) %llx\n", 978 np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); 979 } 980 981 static void remove_dma_window(struct device_node *np, u32 *ddw_avail, 982 struct property *win, bool cleanup) 983 { 984 struct dynamic_dma_window_prop *dwp; 985 u64 liobn; 986 987 dwp = win->value; 988 liobn = (u64)be32_to_cpu(dwp->liobn); 989 990 if (cleanup) 991 clean_dma_window(np, dwp); 992 __remove_dma_window(np, ddw_avail, liobn); 993 } 994 995 static void copy_property(struct device_node *pdn, const char *from, const char *to) 996 { 997 struct property *src, *dst; 998 999 src = of_find_property(pdn, from, NULL); 1000 if (!src) 1001 return; 1002 1003 dst = kzalloc_obj(*dst); 1004 if (!dst) 1005 return; 1006 1007 dst->name = kstrdup(to, GFP_KERNEL); 1008 dst->value = kmemdup(src->value, src->length, GFP_KERNEL); 1009 dst->length = src->length; 1010 if (!dst->name || !dst->value) 1011 return; 1012 1013 if (of_add_property(pdn, dst)) { 1014 pr_err("Unable to add DMA window property for %pOF", pdn); 1015 goto free_prop; 1016 } 1017 1018 return; 1019 1020 free_prop: 1021 kfree(dst->name); 1022 kfree(dst->value); 1023 kfree(dst); 1024 } 1025 1026 static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name, 1027 bool cleanup) 1028 { 1029 struct property *win; 1030 u32 ddw_avail[DDW_APPLICABLE_SIZE]; 1031 int ret = 0; 1032 1033 win = of_find_property(np, win_name, NULL); 1034 if (!win) 1035 return -EINVAL; 1036 1037 ret = of_property_read_u32_array(np, 
"ibm,ddw-applicable", 1038 &ddw_avail[0], DDW_APPLICABLE_SIZE); 1039 if (ret) 1040 return 0; 1041 1042 if (win->length >= sizeof(struct dynamic_dma_window_prop)) 1043 remove_dma_window(np, ddw_avail, win, cleanup); 1044 1045 if (!remove_prop) 1046 return 0; 1047 1048 /* Default window property if removed is lost as reset-pe doesn't restore it. 1049 * Though FDT has a copy of it, the DLPAR hotplugged devices will not have a 1050 * node on FDT until next reboot. So, back it up. 1051 */ 1052 if ((strcmp(win_name, "ibm,dma-window") == 0) && 1053 !of_find_property(np, "ibm,dma-window-saved", NULL)) 1054 copy_property(np, win_name, "ibm,dma-window-saved"); 1055 1056 ret = of_remove_property(np, win); 1057 if (ret) 1058 pr_warn("%pOF: failed to remove DMA window property: %d\n", 1059 np, ret); 1060 return 0; 1061 } 1062 1063 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift, 1064 bool *direct_mapping) 1065 { 1066 struct dma_win *window; 1067 const struct dynamic_dma_window_prop *dma64; 1068 bool found = false; 1069 1070 spin_lock(&dma_win_list_lock); 1071 /* check if we already created a window and dupe that config if so */ 1072 list_for_each_entry(window, &dma_win_list, list) { 1073 if (window->device == pdn) { 1074 dma64 = window->prop; 1075 *dma_addr = be64_to_cpu(dma64->dma_base); 1076 *window_shift = be32_to_cpu(dma64->window_shift); 1077 *direct_mapping = window->direct; 1078 found = true; 1079 break; 1080 } 1081 } 1082 spin_unlock(&dma_win_list_lock); 1083 1084 return found; 1085 } 1086 1087 static struct dma_win *ddw_list_new_entry(struct device_node *pdn, 1088 const struct dynamic_dma_window_prop *dma64) 1089 { 1090 struct dma_win *window; 1091 1092 window = kzalloc_obj(*window); 1093 if (!window) 1094 return NULL; 1095 1096 window->device = pdn; 1097 window->prop = dma64; 1098 window->direct = false; 1099 1100 return window; 1101 } 1102 1103 static void find_existing_ddw_windows_named(const char *name) 1104 { 1105 int len; 
1106 struct device_node *pdn; 1107 struct dma_win *window; 1108 const struct dynamic_dma_window_prop *dma64; 1109 1110 for_each_node_with_property(pdn, name) { 1111 dma64 = of_get_property(pdn, name, &len); 1112 if (!dma64 || len < sizeof(*dma64)) { 1113 remove_dma_window_named(pdn, true, name, true); 1114 continue; 1115 } 1116 1117 /* If at the time of system initialization, there are DDWs in OF, 1118 * it means this is during kexec. DDW could be direct or dynamic. 1119 * We will just mark DDWs as "dynamic" since this is kdump path, 1120 * no need to worry about perforance. ddw_list_new_entry() will 1121 * set window->direct = false. 1122 */ 1123 window = ddw_list_new_entry(pdn, dma64); 1124 if (!window) { 1125 of_node_put(pdn); 1126 break; 1127 } 1128 1129 spin_lock(&dma_win_list_lock); 1130 list_add(&window->list, &dma_win_list); 1131 spin_unlock(&dma_win_list_lock); 1132 } 1133 } 1134 1135 static int find_existing_ddw_windows(void) 1136 { 1137 if (!firmware_has_feature(FW_FEATURE_LPAR)) 1138 return 0; 1139 1140 find_existing_ddw_windows_named(DIRECT64_PROPNAME); 1141 find_existing_ddw_windows_named(DMA64_PROPNAME); 1142 1143 return 0; 1144 } 1145 machine_arch_initcall(pseries, find_existing_ddw_windows); 1146 1147 /** 1148 * ddw_read_ext - Get the value of an DDW extension 1149 * @np: device node from which the extension value is to be read. 1150 * @extnum: index number of the extension. 1151 * @value: pointer to return value, modified when extension is available. 1152 * 1153 * Checks if "ibm,ddw-extensions" exists for this node, and get the value 1154 * on index 'extnum'. 1155 * It can be used only to check if a property exists, passing value == NULL. 1156 * 1157 * Returns: 1158 * 0 if extension successfully read 1159 * -EINVAL if the "ibm,ddw-extensions" does not exist, 1160 * -ENODATA if "ibm,ddw-extensions" does not have a value, and 1161 * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension. 
 */
static inline int ddw_read_ext(const struct device_node *np, int extnum,
			       u32 *value)
{
	static const char propname[] = "ibm,ddw-extensions";
	u32 count;
	int ret;

	/* Index DDW_EXT_SIZE (0) holds the number of extensions present. */
	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
	if (ret)
		return ret;

	if (count < extnum)
		return -EOVERFLOW;

	/* Caller is only probing for existence: read into scratch storage. */
	if (!value)
		value = &count;

	return of_property_read_u32_index(np, propname, extnum, value);
}

/* Call ibm,query-pe-dma-windows for @dev's PE and decode the 5- or 6-word
 * RTAS output into @query. @parent is the node carrying "ibm,ddw-applicable".
 */
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		     struct ddw_query_response *query,
		     struct device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
	 * output parameters ibm,query-pe-dma-windows will have, ranging from
	 * 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		/* 6-word form reports the largest block as a 64-bit quantity */
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
			query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

/* Call ibm,create-pe-dma-window to create a 2^@window_shift byte window of
 * 2^@page_shift IOMMU pages for @dev's PE; LIOBN and base address land in
 * @create. Retries while the RTAS call reports busy.
 */
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		      struct ddw_create_response *create, int page_shift,
		      int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		 "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		 "(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

/* PE node for which DDW setup failed; kept on a list so we never retry it. */
struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

/* Highest physical address DMA may ever need to reach, accounting for
 * memory hotplug when NUMA+HOTPLUG are configured.
 */
static phys_addr_t ddw_memory_hotplug_max(void)
{
	resource_size_t max_addr;

#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
	max_addr = hot_add_drconf_memory_max();
#else
	max_addr = memblock_end_of_DRAM();
#endif

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	/* Silently bail when the platform has no reset-pe-dma-windows token */
	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/*
 * Platforms support placing PHB in limited address mode starting with LoPAR
 * level 2.13 implement. In this mode, the DMA address returned by DDW is over
 * 4GB but, less than 64-bits. This benefits IO adapters that don't support
 * 64-bits for DMA addresses.
 */
static int limited_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win, las_supported;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		goto out;

	ret = ddw_read_ext(par_dn, DDW_EXT_LIMITED_ADDR_MODE, &las_supported);

	/* Limited Address Space extension available on the platform but DDW in
	 * limited addressing mode not supported
	 */
	if (!ret && !las_supported)
		ret = -EPROTO;

	if (ret) {
		dev_info(&dev->dev, "Limited Address Space for DDW not Supported, err: %d", ret);
		goto out;
	}

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	/* Trailing argument 1 selects limited addressing mode for the reset */
	ret = rtas_call(reset_dma_win, 4, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid), 1);
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) for Limited Addr Support: %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);

out:
	return ret;
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0 ; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

/* Build a struct property holding a dynamic_dma_window_prop (liobn, base,
 * tce_shift, window_shift, all big-endian). Returns NULL on allocation
 * failure; on success the caller owns name/value/property and must free
 * them on its error paths.
 */
static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc_obj(*win64);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc_obj(*ddwprop);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma_window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns true if can map all pages (direct mapping), false otherwise..
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn, u64 dma_mask)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr, dynamic_offset = 0;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool dynamic_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;
	bool limited_addr_req = false, limited_addr_enabled = false;
	int dev_max_ddw;
	int ddw_sz;

	/* Presence of persistent memory (vPMEM) changes window sizing below */
	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
		goto out_unlock;

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition. Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/* DMA Limited Addressing required? This is when the driver has
	 * requested to create DDW but supports mask which is less than 64-bits
	 */
	limited_addr_req = (dma_mask != DMA_BIT_MASK(64));

	/* place the PHB in Limited Addressing mode */
	if (limited_addr_req) {
		if (limited_dma_window(dev, pdn))
			goto out_failed;

		/* PHB is in Limited address mode */
		limited_addr_enabled = true;
	}

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win, true);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/* Maximum DMA window size that the device can address (in log2) */
	dev_max_ddw = fls64(dma_mask);

	/* If the device DMA mask is less than 64-bits, make sure the DMA window
	 * size is not bigger than what the device can access
	 */
	ddw_sz = min(order_base_2(query.largest_available_block << page_shift),
		     dev_max_ddw);

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (ddw_sz >= MAX_PHYSMEM_BITS)
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (ddw_sz < len) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = ddw_sz;
		dynamic_mapping = true;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));

		/* DDW is big enough to direct map RAM. If there is vPMEM, check
		 * if enough space is left in DDW where we can dynamically
		 * allocate TCEs for vPMEM. For now, this Hybrid sharing of DDW
		 * is only for SR-IOV devices.
		 */
		if (default_win_removed && pmem_present && !direct_mapping) {
			/* DDW is big enough to be split */
			if ((1ULL << ddw_sz) >=
			    MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) {

				direct_mapping = true;

				/* offset of the Dynamic part of DDW */
				dynamic_offset = 1ULL << max_ram_len;
			}

			/* DDW will at least have dynamic allocation */
			dynamic_mapping = true;

			/* create max size DDW possible */
			len = ddw_sz;
		}
	}

	/* Even if the DDW is split into both direct mapped RAM and dynamically
	 * mapped vPMEM, the DDW property in OF will be marked as Direct.
	 */
	win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	window->direct = direct_mapping;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set*/
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
		if (default_win_removed) {
			iommu_tce_table_put(pci->table_group->tables[0]);
			pci->table_group->tables[0] = NULL;
			set_iommu_table_base(&dev->dev, NULL);
		}
	}

	if (dynamic_mapping) {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;
		u64 dynamic_addr, dynamic_len;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		/* If the DDW is split between directly mapped RAM and Dynamic
		 * mapped for TCES, offset into the DDW where the dynamic part
		 * begins.
		 */
		dynamic_addr = win_addr + dynamic_offset;
		dynamic_len = (1UL << len) - dynamic_offset;
		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,
					    dynamic_addr, dynamic_len, page_shift, NULL,
					    &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node,
				 start >> page_shift, end >> page_shift);

		pci->table_group->tables[default_win_removed ? 0 : 1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		/* default_win is valid here because default_win_removed == true */
		if (!of_find_property(pdn, "ibm,dma-window-saved", NULL))
			copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved");
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly.
*/ 1757 __remove_dma_window(pdn, ddw_avail, create.liobn); 1758 1759 out_failed: 1760 if (default_win_removed || limited_addr_enabled) 1761 reset_dma_window(dev, pdn); 1762 1763 fpdn = kzalloc_obj(*fpdn); 1764 if (!fpdn) 1765 goto out_unlock; 1766 fpdn->pdn = pdn; 1767 list_add(&fpdn->list, &failed_ddw_pdn_list); 1768 1769 out_unlock: 1770 mutex_unlock(&dma_win_init_mutex); 1771 1772 /* For pre-mapped memory, set bus_dma_limit to the max RAM */ 1773 if (direct_mapping) 1774 dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + 1775 (1ULL << max_ram_len); 1776 1777 dev_info(&dev->dev, "lsa_required: %x, lsa_enabled: %x, direct mapping: %x\n", 1778 limited_addr_req, limited_addr_enabled, direct_mapping); 1779 1780 return direct_mapping; 1781 } 1782 1783 static __u64 query_page_size_to_mask(u32 query_page_size) 1784 { 1785 const long shift[] = { 1786 (SZ_4K), (SZ_64K), (SZ_16M), 1787 (SZ_32M), (SZ_64M), (SZ_128M), 1788 (SZ_256M), (SZ_16G), (SZ_2M) 1789 }; 1790 int i, ret = 0; 1791 1792 for (i = 0; i < ARRAY_SIZE(shift); i++) { 1793 if (query_page_size & (1 << i)) 1794 ret |= shift[i]; 1795 } 1796 1797 return ret; 1798 } 1799 1800 static void spapr_tce_init_table_group(struct pci_dev *pdev, 1801 struct device_node *pdn, 1802 struct dynamic_dma_window_prop prop) 1803 { 1804 struct iommu_table_group *table_group = PCI_DN(pdn)->table_group; 1805 u32 ddw_avail[DDW_APPLICABLE_SIZE]; 1806 1807 struct ddw_query_response query; 1808 int ret; 1809 1810 /* Only for normal boot with default window. Doesn't matter during 1811 * kdump, since these will not be used during kdump. 
	 */
	if (is_kdump_kernel())
		return;

	if (table_group->max_dynamic_windows_supported != 0)
		return; /* already initialized */

	table_group->tce32_start = be64_to_cpu(prop.dma_base);
	table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);

	if (!of_find_property(pdn, "ibm,dma-window", NULL))
		dev_err(&pdev->dev, "default dma window missing!\n");

	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret) {
		/* No DDW support: flag the group so we never probe it again */
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	ret = query_ddw(pdev, ddw_avail, &query, pdn);
	if (ret) {
		dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__);
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	if (query.windows_available == 0)
		table_group->max_dynamic_windows_supported = 1;
	else
		table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES;

	table_group->max_levels = 1;
	table_group->pgsizes |= query_page_size_to_mask(query.page_size);
}

/* Per-device DMA setup for LPAR: locate the owning dma-window node, create
 * (or reuse) the PE's 32-bit TCE table and attach the device to it.
 */
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	struct pci_dn *pci;
	struct dynamic_dma_window_prop prop;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &prop);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug("  parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];

		iommu_table_setparms_common(tbl, pci->phb->bus->number,
					    be32_to_cpu(prop.liobn),
					    be64_to_cpu(prop.dma_base),
					    1ULL << be32_to_cpu(prop.window_shift),
					    be32_to_cpu(prop.tce_shift), NULL,
					    &iommu_table_lpar_multi_ops);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				     pci_domain_nr(pci->phb->bus), 0);
		pr_debug("  created table: %p\n", pci->table_group);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->table_group);
	}

	spapr_tce_init_table_group(dev, pdn, prop);

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

/* Decide whether @pdev may bypass the default 32-bit window via DDW; returns
 * enable_ddw()'s verdict (true only when the whole partition is direct-mapped).
 */
static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* For DDW, DMA mask should be more than 32-bits. For mask more then
	 * 32-bits but less then 64-bits, DMA addressing is supported in
	 * Limited Addressing mode.
	 */
	if (dma_mask <= DMA_BIT_MASK(32))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	pdn = pci_dma_find(dn, NULL);
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn, dma_mask);

	return false;
}

#ifdef CONFIG_IOMMU_API
/*
 * A simple iommu_table_group_ops which only allows reusing the existing
 * iommu_table. This handles VFIO for POWER7 or the nested KVM.
 * The ops does not allow creating windows and only allows reusing the existing
 * one if it matches table_group->tce32_start/tce32_size/page_shift.
 */
static unsigned long spapr_tce_get_table_size(__u32 page_shift,
					      __u64 window_size, __u32 levels)
{
	unsigned long size;

	/* Only single-level tables are supported here */
	if (levels > 1)
		return ~0U;
	/* One 8-byte TCE per IOMMU page in the window */
	size = window_size >> (page_shift - 3);
	return size;
}

/* Return the first PCI device in @group that has an IOMMU table, or NULL. */
static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group)
{
	struct pci_dev *pdev = NULL;
	int ret;

	/* No IOMMU group ? */
	if (!group)
		return NULL;

	ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table);
	if (!ret || !pdev)
		return NULL;
	return pdev;
}

/* Reset the PE to its firmware default DMA window and re-create the
 * "ibm,dma-window" property from the saved backup copy.
 */
static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn)
{
	reset_dma_window(pdev, pdn);
	copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window");
}

/* Tear down any existing DDW on @pdn: remove its OF property, drop the
 * corresponding iommu_table(s) and unlink its dma_win_list entry.
 */
static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn)
{
	struct pci_dn *pci = PCI_DN(pdn);
	struct dma_win *window;
	bool direct_mapping;
	int len;

	if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) {
		remove_dma_window_named(pdn, true,
					direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME,
					true);
		if (!direct_mapping) {
			WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]);

			if (pci->table_group->tables[1]) {
				iommu_tce_table_put(pci->table_group->tables[1]);
				pci->table_group->tables[1] = NULL;
			} else if (pci->table_group->tables[0]) {
				/* Default window was removed and only the DDW exists */
				iommu_tce_table_put(pci->table_group->tables[0]);
				pci->table_group->tables[0] = NULL;
			}
		}
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == pdn) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
	}

	return 0;
}

/* Rebuild the firmware default 32-bit window for @dev and install it as
 * table 0 of its group. Returns 0 on success, -1 on failure.
 */
static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group,
					       struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	const __be32 *default_prop;
	long liobn, offset, size;
	struct device_node *pdn;
	struct iommu_table *tbl;
	struct pci_dn *pci;

	pdn = pci_dma_find_parent_node(pdev, table_group);
	if (!pdn || !PCI_DN(pdn)) {
		dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn);
		return -1;
	}
	pci = PCI_DN(pdn);

	/* The default window is restored if not present already on removal of DDW.
	 * However, if used by VFIO SPAPR sub driver, the user's order of removal of
	 * windows might have been different to not leading to auto restoration,
	 * suppose the DDW was removed first followed by the default one.
	 * So, restore the default window with reset-pe-dma call explicitly.
	 */
	restore_default_dma_window(pdev, pdn);

	default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
	of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);
	tbl = iommu_pseries_alloc_table(pci->phb->node);
	if (!tbl) {
		dev_err(&pdev->dev, "couldn't create new IOMMU table\n");
		return -1;
	}

	iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset,
				    size, IOMMU_PAGE_SHIFT_4K, NULL,
				    &iommu_table_lpar_multi_ops);
	iommu_init_table(tbl, pci->phb->node, 0, 0);

	pci->table_group->tables[0] = tbl;
	set_iommu_table_base(&pdev->dev, tbl);

	return 0;
}

/* True when the requested window matches the firmware default 32-bit window:
 * fits within tce32_size and uses 4K IOMMU pages.
 */
static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift,
				      __u64 window_size)
{
	if ((window_size <= table_group->tce32_size) &&
	    (page_shift == IOMMU_PAGE_SHIFT_4K))
		return true;

	return false;
}

/* iommu_table_group_ops::create_table for pSeries: window 0 requests matching
 * the default window rebuild it; anything else creates a DDW via RTAS.
 */
static long spapr_tce_create_table(struct iommu_table_group *table_group, int num,
				   __u32 page_shift, __u64 window_size, __u32 levels,
				   struct iommu_table **ptbl)
{
	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct ddw_create_response create;
	unsigned long liobn, offset, size;
	unsigned long start = 0, end = 0;
	struct ddw_query_response query;
	const __be32 *default_prop;
	struct failed_ddw_pdn *fpdn;
	unsigned int window_shift;
	struct device_node *pdn;
	struct iommu_table *tbl;
	struct dma_win *window;
	struct property *win64;
	struct pci_dn *pci;
	u64 win_addr;
	int len, i;
	long ret;

	if (!is_power_of_2(window_size) || levels > 1)
		return -EINVAL;

	window_shift = order_base_2(window_size);

	mutex_lock(&dma_win_init_mutex);

	ret = -ENODEV;

	pdn = pci_dma_find_parent_node(pdev, table_group);
	if (!pdn || !PCI_DN(pdn)) { /* Neither 32-bit nor 64-bit window exists! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		goto out_failed;
	}
	pci = PCI_DN(pdn);

	/* If the enable DDW failed for the pdn, dont retry! */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn) {
			dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn);
			goto out_unlock;
		}
	}

	tbl = iommu_pseries_alloc_table(pci->phb->node);
	if (!tbl) {
		dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n");
		goto out_unlock;
	}

	if (num == 0) {
		bool direct_mapping;
		/* The request is not for default window? Ensure there is no DDW window already */
		if (!is_default_window_request(table_group, page_shift, window_size)) {
			if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,
					      &direct_mapping)) {
				dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn);
				ret = -EPERM;
				goto out_unlock;
			}
		} else {
			/* Request is for Default window, ensure there is no DDW if there is a
			 * need to reset. reset-pe otherwise removes the DDW also
			 */
			default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
			if (!default_prop) {
				if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,
						      &direct_mapping)) {
					dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. Preventing the attempt as that would destroy the 64-bit window",
						 pdn);
					ret = -EPERM;
					goto out_unlock;
				}

				restore_default_dma_window(pdev, pdn);

				default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
				of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);
				/* Limit the default window size to window_size */
				iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn,
							    offset, 1UL << window_shift,
							    IOMMU_PAGE_SHIFT_4K, NULL,
							    &iommu_table_lpar_multi_ops);
				iommu_init_table(tbl, pci->phb->node,
						 start >> IOMMU_PAGE_SHIFT_4K,
						 end >> IOMMU_PAGE_SHIFT_4K);

				table_group->tables[0] = tbl;

				mutex_unlock(&dma_win_init_mutex);

				goto exit;
			}
		}
	}

	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret) {
		dev_info(&pdev->dev, "ibm,ddw-applicable not found\n");
		goto out_failed;
	}
	ret = -ENODEV;

	pr_err("%s: Calling query %pOF\n", __func__, pdn);
	ret = query_ddw(pdev, ddw_avail, &query, pdn);
	if (ret)
		goto out_failed;
	ret = -ENODEV;

	len = window_shift;
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n",
			1ULL << len, query.largest_available_block,
			1ULL << page_shift);
		ret = -EINVAL; /* Retry with smaller window size */
		goto out_unlock;
	}

	if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) {
		pr_err("%s: Create ddw failed %pOF\n", __func__, pdn);
		goto out_failed;
	}

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len);
	if (!win64)
		goto remove_window;

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret);
		goto free_property;
	}
	ret = -ENODEV;

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto remove_property;

	window->direct = false;

	for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
		const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

		/* Look for MMIO32 */
		if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
			start = pci->phb->mem_resources[i].start;
			end = pci->phb->mem_resources[i].end;
			break;
		}
	}

	/* New table for using DDW instead of the default DMA window */
	iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr,
				    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
	iommu_init_table(tbl, pci->phb->node, start >> page_shift, end >> page_shift);

	pci->table_group->tables[num] = tbl;
	set_iommu_table_base(&pdev->dev, tbl);
	pdev->dev.archdata.dma_offset = win_addr;

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	mutex_unlock(&dma_win_init_mutex);

	goto exit;

remove_property:
	of_remove_property(pdn, win64);
free_property:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);
remove_window:
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	fpdn = kzalloc_obj(*fpdn);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	return ret;
exit:
	/* Allocate the userspace view */
	pseries_tce_iommu_userspace_view_alloc(tbl);
	tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels);

	*ptbl = iommu_tce_table_get(tbl);

	return 0;
}

/* True when @tbl describes the firmware default 32-bit window. */
static bool
is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl)
{
	if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) &&
	    (tbl->it_page_shift == IOMMU_PAGE_SHIFT_4K))
		return true;

	return false;
}

/* Only reuse of an already-created table at slot @num is permitted. */
static long spapr_tce_set_window(struct iommu_table_group *table_group,
				 int num, struct iommu_table *tbl)
{
	return tbl == table_group->tables[num] ? 0 : -EPERM;
}

static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num)
{
	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
	struct iommu_table *tbl = table_group->tables[num];
	struct failed_ddw_pdn *fpdn;
	struct dma_win *window;
	const char *win_name;
	int ret = -ENODEV;

	if (!tbl) /* The table was never created OR window was never opened */
		return 0;

	mutex_lock(&dma_win_init_mutex);

	if ((num == 0) && is_default_window_table(table_group, tbl))
		win_name = "ibm,dma-window";
	else
		win_name = DMA64_PROPNAME;

	pdn = pci_dma_find(dn, NULL);
	if (!pdn || !PCI_DN(pdn)) { /* Neither 32-bit nor 64-bit window exists! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		goto out_failed;
	}

	/* Dont clear the TCEs, User should have done it */
	if (remove_dma_window_named(pdn, true, win_name, false)) {
		pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn);
		goto out_failed; /* Could not remove it either!
*/ 2303 } 2304 2305 if (strcmp(win_name, DMA64_PROPNAME) == 0) { 2306 spin_lock(&dma_win_list_lock); 2307 list_for_each_entry(window, &dma_win_list, list) { 2308 if (window->device == pdn) { 2309 list_del(&window->list); 2310 kfree(window); 2311 break; 2312 } 2313 } 2314 spin_unlock(&dma_win_list_lock); 2315 } 2316 2317 iommu_tce_table_put(table_group->tables[num]); 2318 table_group->tables[num] = NULL; 2319 2320 ret = 0; 2321 2322 goto out_unlock; 2323 2324 out_failed: 2325 fpdn = kzalloc_obj(*fpdn); 2326 if (!fpdn) 2327 goto out_unlock; 2328 fpdn->pdn = pdn; 2329 list_add(&fpdn->list, &failed_ddw_pdn_list); 2330 2331 out_unlock: 2332 mutex_unlock(&dma_win_init_mutex); 2333 2334 return ret; 2335 } 2336 2337 static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev) 2338 { 2339 struct iommu_table *tbl = table_group->tables[0]; 2340 struct pci_dev *pdev = to_pci_dev(dev); 2341 struct device_node *dn = pci_device_to_OF_node(pdev); 2342 struct device_node *pdn; 2343 2344 /* SRIOV VFs using direct map by the host driver OR multifunction devices 2345 * where the ownership was taken on the attempt by the first function 2346 */ 2347 if (!tbl && (table_group->max_dynamic_windows_supported != 1)) 2348 return 0; 2349 2350 mutex_lock(&dma_win_init_mutex); 2351 2352 pdn = pci_dma_find(dn, NULL); 2353 if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */ 2354 dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn); 2355 mutex_unlock(&dma_win_init_mutex); 2356 return -1; 2357 } 2358 2359 /* 2360 * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table 2361 * if there are any. In case of direct map, the entries will be left over, which 2362 * is fine for PEs with 2 DMA windows where the second window is created with create-pe 2363 * at which point the table is cleared. 
However, on VFs having only one DMA window, the 2364 * default window would end up seeing the entries left over from the direct map done 2365 * on the second window. So, remove the ddw explicitly so that clean_dma_window() 2366 * cleans up the entries if any. 2367 */ 2368 if (remove_dynamic_dma_windows(pdev, pdn)) { 2369 dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn); 2370 mutex_unlock(&dma_win_init_mutex); 2371 return -1; 2372 } 2373 2374 /* The table_group->tables[0] is not null now, it must be the default window 2375 * Remove it, let the userspace create it as it needs. 2376 */ 2377 if (table_group->tables[0]) { 2378 remove_dma_window_named(pdn, true, "ibm,dma-window", true); 2379 iommu_tce_table_put(tbl); 2380 table_group->tables[0] = NULL; 2381 } 2382 set_iommu_table_base(dev, NULL); 2383 2384 mutex_unlock(&dma_win_init_mutex); 2385 2386 return 0; 2387 } 2388 2389 static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev) 2390 { 2391 struct iommu_table *tbl = table_group->tables[0]; 2392 2393 if (tbl) { /* Default window already restored */ 2394 return; 2395 } 2396 2397 mutex_lock(&dma_win_init_mutex); 2398 2399 /* Restore the default window */ 2400 pseries_setup_default_iommu_config(table_group, dev); 2401 2402 mutex_unlock(&dma_win_init_mutex); 2403 2404 return; 2405 } 2406 2407 static struct iommu_table_group_ops spapr_tce_table_group_ops = { 2408 .get_table_size = spapr_tce_get_table_size, 2409 .create_table = spapr_tce_create_table, 2410 .set_window = spapr_tce_set_window, 2411 .unset_window = spapr_tce_unset_window, 2412 .take_ownership = spapr_tce_take_ownership, 2413 .release_ownership = spapr_tce_release_ownership, 2414 }; 2415 #endif 2416 2417 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, 2418 void *data) 2419 { 2420 struct dma_win *window; 2421 struct memory_notify *arg = data; 2422 int ret = 0; 2423 2424 /* This notifier can get called when 
onlining persistent memory as well. 2425 * TCEs are not pre-mapped for persistent memory. Persistent memory will 2426 * always be above ddw_memory_hotplug_max() 2427 */ 2428 2429 switch (action) { 2430 case MEM_GOING_ONLINE: 2431 spin_lock(&dma_win_list_lock); 2432 list_for_each_entry(window, &dma_win_list, list) { 2433 if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2434 ddw_memory_hotplug_max()) { 2435 ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, 2436 arg->nr_pages, window->prop); 2437 } 2438 /* XXX log error */ 2439 } 2440 spin_unlock(&dma_win_list_lock); 2441 break; 2442 case MEM_CANCEL_ONLINE: 2443 case MEM_OFFLINE: 2444 spin_lock(&dma_win_list_lock); 2445 list_for_each_entry(window, &dma_win_list, list) { 2446 if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2447 ddw_memory_hotplug_max()) { 2448 ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, 2449 arg->nr_pages, window->prop); 2450 } 2451 /* XXX log error */ 2452 } 2453 spin_unlock(&dma_win_list_lock); 2454 break; 2455 default: 2456 break; 2457 } 2458 if (ret && action != MEM_CANCEL_ONLINE) 2459 return NOTIFY_BAD; 2460 2461 return NOTIFY_OK; 2462 } 2463 2464 static struct notifier_block iommu_mem_nb = { 2465 .notifier_call = iommu_mem_notifier, 2466 }; 2467 2468 static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data) 2469 { 2470 int err = NOTIFY_OK; 2471 struct of_reconfig_data *rd = data; 2472 struct device_node *np = rd->dn; 2473 struct pci_dn *pci = PCI_DN(np); 2474 struct dma_win *window; 2475 2476 switch (action) { 2477 case OF_RECONFIG_DETACH_NODE: 2478 /* 2479 * Removing the property will invoke the reconfig 2480 * notifier again, which causes dead-lock on the 2481 * read-write semaphore of the notifier chain. So 2482 * we have to remove the property when releasing 2483 * the device node. 
2484 */ 2485 if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true)) 2486 remove_dma_window_named(np, false, DMA64_PROPNAME, true); 2487 2488 if (pci && pci->table_group) 2489 iommu_pseries_free_group(pci->table_group, 2490 np->full_name); 2491 2492 spin_lock(&dma_win_list_lock); 2493 list_for_each_entry(window, &dma_win_list, list) { 2494 if (window->device == np) { 2495 list_del(&window->list); 2496 kfree(window); 2497 break; 2498 } 2499 } 2500 spin_unlock(&dma_win_list_lock); 2501 break; 2502 default: 2503 err = NOTIFY_DONE; 2504 break; 2505 } 2506 return err; 2507 } 2508 2509 static struct notifier_block iommu_reconfig_nb = { 2510 .notifier_call = iommu_reconfig_notifier, 2511 }; 2512 2513 /* These are called very early. */ 2514 void __init iommu_init_early_pSeries(void) 2515 { 2516 if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) 2517 return; 2518 2519 if (firmware_has_feature(FW_FEATURE_LPAR)) { 2520 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; 2521 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; 2522 if (!disable_ddw) 2523 pseries_pci_controller_ops.iommu_bypass_supported = 2524 iommu_bypass_supported_pSeriesLP; 2525 } else { 2526 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; 2527 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; 2528 } 2529 2530 2531 of_reconfig_notifier_register(&iommu_reconfig_nb); 2532 register_memory_notifier(&iommu_mem_nb); 2533 2534 set_pci_dma_ops(&dma_iommu_ops); 2535 } 2536 2537 static int __init disable_multitce(char *str) 2538 { 2539 if (strcmp(str, "off") == 0 && 2540 firmware_has_feature(FW_FEATURE_LPAR) && 2541 (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || 2542 firmware_has_feature(FW_FEATURE_STUFF_TCE))) { 2543 printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); 2544 powerpc_firmware_features &= 2545 ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); 2546 } 2547 return 1; 2548 } 2549 2550 
__setup("multitce=", disable_multitce);

#ifdef CONFIG_SPAPR_TCE_IOMMU
/*
 * Resolve the IOMMU group for a PCI device by walking up to the node
 * carrying its DMA window. Returns a referenced group on success, or
 * ERR_PTR(-ENODEV) when no window node, pci_dn, table_group or group
 * has been set up for the device.
 */
struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose,
					     struct pci_dev *pdev)
{
	struct device_node *win_node = pci_dma_find(pdev->dev.of_node, NULL);
	struct iommu_group *grp = NULL;
	struct pci_dn *pdn_data;

	if (win_node && PCI_DN(win_node)) {
		pdn_data = PCI_DN(win_node);
		if (pdn_data->table_group)
			grp = pdn_data->table_group->group;
	}

	if (!grp)
		return ERR_PTR(-ENODEV);

	return iommu_group_ref_get(grp);
}
#endif