// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright 2010
 *  by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 *
 * This code provides an IOMMU for Xen PV guests with PCI passthrough.
 *
 * PV guests under Xen are running in a non-contiguous memory architecture.
 *
 * When PCI pass-through is utilized, this necessitates an IOMMU for
 * translating bus (DMA) addresses to pseudo-physical addresses and vice
 * versa, and also providing a mechanism to obtain contiguous pages for
 * device driver operations (say, DMA operations).
 *
 * Specifically, under Xen the Linux idea of pages is an illusion. It
 * assumes that pages start at zero and go up to the available memory. To
 * help with that, the Linux Xen MMU provides a lookup mechanism to
 * translate the page frame numbers (PFN) to machine frame numbers (MFN)
 * and vice versa. The MFNs are the "real" frame numbers. Furthermore,
 * memory is not contiguous: the Xen hypervisor stitches memory for guests
 * from different pools, which means there is no guarantee that PFN==MFN
 * and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are
 * allocated in descending order (high to low), meaning the guest might
 * never get any MFNs under the 4GB mark.
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/export.h>
#include <xen/swiotlb-xen.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/hvc-console.h>

#include <asm/dma-mapping.h>
#include <asm/xen/page-coherent.h>

#include <trace/events/swiotlb.h>
#define MAX_DMA_BITS 32
/*
 * Used to do a quick range check in swiotlb_tbl_unmap_single and
 * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
 * API.
 */

static char *xen_io_tlb_start, *xen_io_tlb_end;
static unsigned long xen_io_tlb_nslabs;

/*
 * Quick lookup value of the bus address of the IOTLB.
 */

static u64 start_dma_addr;
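
/*
 * Illustrative note (the numbers below are made up, purely for exposition):
 * the translation helpers that follow split an address into a Xen page
 * frame and an in-page offset, translate only the frame via
 * pfn_to_bfn()/bfn_to_pfn(), and then splice the offset back in.  For
 * instance, with XEN_PAGE_SHIFT == 12, a pseudo-physical address of
 * 0x12345678 has frame 0x12345 and offset 0x678; if that frame happens to
 * be backed by machine frame 0xabcde, the resulting bus address is
 * 0xabcde678.
 */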

/*
 * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t
 * can be 32 bits when dma_addr_t is 64 bits, leading to a loss of
 * information if the shift is done before casting to 64 bits.
 */
static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
{
	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
	dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT;

	dma |= paddr & ~XEN_PAGE_MASK;

	return dma;
}

static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
{
	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
	dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT;
	phys_addr_t paddr = dma;

	paddr |= baddr & ~XEN_PAGE_MASK;

	return paddr;
}

static inline dma_addr_t xen_virt_to_bus(void *address)
{
	return xen_phys_to_bus(virt_to_phys(address));
}

static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
{
	unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p);
	unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size);

	next_bfn = pfn_to_bfn(xen_pfn);

	for (i = 1; i < nr_pages; i++)
		if (pfn_to_bfn(++xen_pfn) != ++next_bfn)
			return 1;

	return 0;
}

static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
{
	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
	unsigned long xen_pfn = bfn_to_local_pfn(bfn);
	phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn);

	/* If the address is outside our domain, it CAN
	 * have the same virtual address as another address
	 * in our domain. Therefore _only_ check addresses within our domain.
	 */
	if (pfn_valid(PFN_DOWN(paddr))) {
		return paddr >= virt_to_phys(xen_io_tlb_start) &&
		       paddr < virt_to_phys(xen_io_tlb_end);
	}
	return 0;
}

static int
xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
{
	int i, rc;
	int dma_bits;
	dma_addr_t dma_handle;
	phys_addr_t p = virt_to_phys(buf);

	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;

	i = 0;
	do {
		int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);

		do {
			rc = xen_create_contiguous_region(
				p + (i << IO_TLB_SHIFT),
				get_order(slabs << IO_TLB_SHIFT),
				dma_bits, &dma_handle);
		} while (rc && dma_bits++ < MAX_DMA_BITS);
		if (rc)
			return rc;

		i += slabs;
	} while (i < nslabs);
	return 0;
}

static unsigned long xen_set_nslabs(unsigned long nr_tbl)
{
	if (!nr_tbl) {
		xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
		xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
	} else
		xen_io_tlb_nslabs = nr_tbl;

	return xen_io_tlb_nslabs << IO_TLB_SHIFT;
}

enum xen_swiotlb_err {
	XEN_SWIOTLB_UNKNOWN = 0,
	XEN_SWIOTLB_ENOMEM,
	XEN_SWIOTLB_EFIXUP
};

static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
{
	switch (err) {
	case XEN_SWIOTLB_ENOMEM:
		return "Cannot allocate Xen-SWIOTLB buffer\n";
	case XEN_SWIOTLB_EFIXUP:
		return "Failed to get contiguous memory for DMA from Xen!\n"
		       "You either: don't have the permissions, do not have"
		       " enough free memory under 4GB, or the hypervisor memory"
		       " is too fragmented!";
	default:
		break;
	}
	return "";
}
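
/*
 * A rough sizing sketch (the exact numbers depend on the architecture's
 * IO_TLB_SHIFT/IO_TLB_SEGSIZE definitions, so treat them as illustrative):
 * with the usual 2 KiB slabs, the default pool picked by xen_set_nslabs()
 * is 64 MiB, i.e. 32768 slabs.  If the allocation or the Xen exchange in
 * xen_swiotlb_fixup() fails, xen_swiotlb_init() retries at most three
 * times with the slab count halved, but never below 1024 slabs (2 MiB).
 */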

int __ref xen_swiotlb_init(int verbose, bool early)
{
	unsigned long bytes, order;
	int rc = -ENOMEM;
	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
	unsigned int repeat = 3;

	xen_io_tlb_nslabs = swiotlb_nr_tbl();
retry:
	bytes = xen_set_nslabs(xen_io_tlb_nslabs);
	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);

	/*
	 * IO TLB memory already allocated. Just use it.
	 */
	if (io_tlb_start != 0) {
		xen_io_tlb_start = phys_to_virt(io_tlb_start);
		goto end;
	}

	/*
	 * Get IO TLB memory from any location.
	 */
	if (early) {
		xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
						  PAGE_SIZE);
		if (!xen_io_tlb_start)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
	} else {
#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
			xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
			if (xen_io_tlb_start)
				break;
			order--;
		}
		if (order != get_order(bytes)) {
			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
				(PAGE_SIZE << order) >> 20);
			xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
		}
	}
	if (!xen_io_tlb_start) {
		m_ret = XEN_SWIOTLB_ENOMEM;
		goto error;
	}
	/*
	 * And replace that memory with pages under 4GB.
	 */
	rc = xen_swiotlb_fixup(xen_io_tlb_start,
			       bytes,
			       xen_io_tlb_nslabs);
	if (rc) {
		if (early)
			memblock_free(__pa(xen_io_tlb_start),
				      PAGE_ALIGN(bytes));
		else {
			free_pages((unsigned long)xen_io_tlb_start, order);
			xen_io_tlb_start = NULL;
		}
		m_ret = XEN_SWIOTLB_EFIXUP;
		goto error;
	}
	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
	if (early) {
		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
					  verbose))
			panic("Cannot allocate SWIOTLB buffer");
		rc = 0;
	} else
		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);

end:
	xen_io_tlb_end = xen_io_tlb_start + bytes;
	if (!rc)
		swiotlb_set_max_segment(PAGE_SIZE);

	return rc;
error:
	if (repeat--) {
		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
					(xen_io_tlb_nslabs >> 1));
		pr_info("Lowering to %luMB\n",
			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
		goto retry;
	}
	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
	if (early)
		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
	else
		free_pages((unsigned long)xen_io_tlb_start, order);
	return rc;
}
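
/*
 * Note on the coherent allocation path below: a coherent buffer handed to a
 * device must be machine-contiguous and must fall within the device's
 * coherent DMA mask.  If the buffer returned by xen_alloc_coherent_pages()
 * already satisfies both conditions it is used as-is; otherwise it is
 * exchanged with Xen via xen_create_contiguous_region() and the page is
 * marked with SetPageXenRemapped() so that xen_swiotlb_free_coherent()
 * knows to undo the exchange on free.
 */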

static void *
xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags,
			   unsigned long attrs)
{
	void *ret;
	int order = get_order(size);
	u64 dma_mask = DMA_BIT_MASK(32);
	phys_addr_t phys;
	dma_addr_t dev_addr;

	/*
	 * Ignore region specifiers - the kernel's idea of the
	 * pseudo-phys memory layout has nothing to do with the
	 * machine physical layout.  We can't allocate highmem
	 * because we can't return a pointer to it.
	 */
	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	/* On ARM this function returns an ioremap'ped virtual address for
	 * which virt_to_phys doesn't return the corresponding physical
	 * address. In fact on ARM virt_to_phys only works for kernel direct
	 * mapped RAM memory. Also see comment below.
	 */
	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);

	if (!ret)
		return ret;

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* At this point dma_handle is the physical address, next we are
	 * going to set it to the machine address.
	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
	 * to *dma_handle. */
	phys = *dma_handle;
	dev_addr = xen_phys_to_bus(phys);
	if (((dev_addr + size - 1 <= dma_mask)) &&
	    !range_straddles_page_boundary(phys, size))
		*dma_handle = dev_addr;
	else {
		if (xen_create_contiguous_region(phys, order,
						 fls64(dma_mask), dma_handle) != 0) {
			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
			return NULL;
		}
		SetPageXenRemapped(virt_to_page(ret));
	}
	memset(ret, 0, size);
	return ret;
}

static void
xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			  dma_addr_t dev_addr, unsigned long attrs)
{
	int order = get_order(size);
	phys_addr_t phys;
	u64 dma_mask = DMA_BIT_MASK(32);

	if (hwdev && hwdev->coherent_dma_mask)
		dma_mask = hwdev->coherent_dma_mask;

	/* do not use virt_to_phys because on ARM it doesn't return the
	 * physical address */
	phys = xen_bus_to_phys(dev_addr);

	/* Convert the size to actually allocated. */
	size = 1UL << (order + XEN_PAGE_SHIFT);

	if (!WARN_ON((dev_addr + size - 1 > dma_mask) ||
		     range_straddles_page_boundary(phys, size)) &&
	    TestClearPageXenRemapped(virt_to_page(vaddr)))
		xen_destroy_contiguous_region(phys, order);

	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
}

/*
 * Map a single buffer of the indicated size for DMA in streaming mode.  The
 * bus (DMA) address to use is returned.
 *
 * Once the device is given the dma address, the device owns this memory until
 * either xen_swiotlb_unmap_page or one of the xen_swiotlb_sync_single_for_*
 * calls is performed.
 */
static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
				       unsigned long offset, size_t size,
				       enum dma_data_direction dir,
				       unsigned long attrs)
{
	phys_addr_t map, phys = page_to_phys(page) + offset;
	dma_addr_t dev_addr = xen_phys_to_bus(phys);

	BUG_ON(dir == DMA_NONE);
	/*
	 * If the address happens to be in the device's DMA window,
	 * we can safely return the device addr and not worry about bounce
	 * buffering it.
	 */
	if (dma_capable(dev, dev_addr, size, true) &&
	    !range_straddles_page_boundary(phys, size) &&
	    !xen_arch_need_swiotlb(dev, phys, dev_addr) &&
	    swiotlb_force != SWIOTLB_FORCE)
		goto done;

	/*
	 * Oh well, have to allocate and map a bounce buffer.
	 */
	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);

	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys,
				     size, size, dir, attrs);
	if (map == (phys_addr_t)DMA_MAPPING_ERROR)
		return DMA_MAPPING_ERROR;

	phys = map;
	dev_addr = xen_phys_to_bus(map);

	/*
	 * Ensure that the address returned is DMA'ble
	 */
	if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
		swiotlb_tbl_unmap_single(dev, map, size, size, dir,
					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
		return DMA_MAPPING_ERROR;
	}

done:
	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		xen_dma_sync_for_device(dev_addr, phys, size, dir);
	return dev_addr;
}
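
/*
 * A note on the unmap/sync helpers below: they only touch the bounce pool
 * when is_xen_swiotlb_buffer() reports that the dma address falls inside it
 * (i.e. the mapping above actually bounced).  For mappings that were
 * returned as-is, only the architecture cache maintenance hooks
 * (xen_dma_sync_for_cpu/xen_dma_sync_for_device) run, and only for
 * non-coherent devices.
 */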

/*
 * Unmap a single streaming mode DMA translation.  The dma_addr and size must
 * match what was provided for in a previous xen_swiotlb_map_page call.  All
 * other usages are undefined.
 *
 * After this call, reads by the cpu to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	phys_addr_t paddr = xen_bus_to_phys(dev_addr);

	BUG_ON(dir == DMA_NONE);

	if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		xen_dma_sync_for_cpu(dev_addr, paddr, size, dir);

	/* NOTE: We use dev_addr here, not paddr! */
	if (is_xen_swiotlb_buffer(dev_addr))
		swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs);
}

static void
xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (!dev_is_dma_coherent(dev))
		xen_dma_sync_for_cpu(dma_addr, paddr, size, dir);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

static void
xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
		size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = xen_bus_to_phys(dma_addr);

	if (is_xen_swiotlb_buffer(dma_addr))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);

	if (!dev_is_dma_coherent(dev))
		xen_dma_sync_for_device(dma_addr, paddr, size, dir);
}

/*
 * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
 * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
 */
static void
xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i)
		xen_swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg),
				dir, attrs);

}

static int
xen_swiotlb_map_sg(struct device *dev, struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(dir == DMA_NONE);

	for_each_sg(sgl, sg, nelems, i) {
		sg->dma_address = xen_swiotlb_map_page(dev, sg_page(sg),
				sg->offset, sg->length, dir, attrs);
		if (sg->dma_address == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(sg) = sg->length;
	}

	return nelems;
out_unmap:
	xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	sg_dma_len(sgl) = 0;
	return 0;
}

static void
xen_swiotlb_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
			    int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_cpu(dev, sg->dma_address,
				sg->length, dir);
	}
}

static void
xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
			       int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i) {
		xen_swiotlb_sync_single_for_device(dev, sg->dma_address,
				sg->length, dir);
	}
}
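
/*
 * Note that xen_swiotlb_map_sg() above maps (and, if needed, bounces) each
 * scatterlist entry individually via xen_swiotlb_map_page(); entries are
 * never merged.  On failure it unwinds the entries mapped so far and
 * returns 0, which the DMA API defines as the error return for map_sg.
 */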

/*
 * Return whether the given device DMA address mask can be supported
 * properly.  For example, if your device can only drive the low 24-bits
 * during bus mastering, then you would pass 0x00ffffff as the mask to
 * this function.
 */
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
}

const struct dma_map_ops xen_swiotlb_dma_ops = {
	.alloc = xen_swiotlb_alloc_coherent,
	.free = xen_swiotlb_free_coherent,
	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
	.map_sg = xen_swiotlb_map_sg,
	.unmap_sg = xen_swiotlb_unmap_sg,
	.map_page = xen_swiotlb_map_page,
	.unmap_page = xen_swiotlb_unmap_page,
	.dma_supported = xen_swiotlb_dma_supported,
	.mmap = dma_common_mmap,
	.get_sgtable = dma_common_get_sgtable,
};
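
/*
 * Closing note (a usage sketch, not part of this file's API): this file only
 * defines the ops table above; architecture setup code is expected to call
 * xen_swiotlb_init() and point devices' dma_map_ops at xen_swiotlb_dma_ops.
 * Once that is done, ordinary DMA API calls in drivers dispatch into the
 * functions above.  A typical streaming mapping would look roughly like this
 * (hypothetical driver code):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, handle))
 *		return -ENOMEM;
 *	...program the device with "handle" and wait for completion...
 *	dma_unmap_page(dev, handle, len, DMA_TO_DEVICE);
 *
 * which ends up in xen_swiotlb_map_page()/xen_swiotlb_unmap_page(), bouncing
 * through the pool set up above whenever the buffer is not directly
 * reachable by the device.
 */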