// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		4
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

/*
 * For device_private pages, dpage is just a dummy struct page
 * representing a piece of device memory. dmirror_devmem_alloc_page
 * allocates a real system memory page as backing storage to fake a
 * real device. zone_device_data points to that backing page. But
 * for device_coherent memory, the struct page represents real
 * physical CPU-accessible memory that we can use directly.
 */
#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
			   (page)->zone_device_data : (page))

static unsigned long spm_addr_dev0;
module_param(spm_addr_dev0, long, 0644);
MODULE_PARM_DESC(spm_addr_dev0,
		"Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");

static unsigned long spm_addr_dev1;
module_param(spm_addr_dev1, long, 0644);
MODULE_PARM_DESC(spm_addr_dev1,
		"Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void *ptr;
	unsigned long size;
	unsigned long addr;
	unsigned long cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier notifier;
	struct dmirror *dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device *mdevice;
	struct xarray pt;
	struct mmu_interval_notifier notifier;
	struct mutex mutex;
	__u64 flags;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap pagemap;
	struct dmirror_device *mdevice;
	bool remove;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev cdevice;
	unsigned int zone_device_type;
	struct device device;

	unsigned int devmem_capacity;
	unsigned int devmem_count;
	struct dmirror_chunk **devmem_chunks;
	struct mutex devmem_lock;	/* protects the above */

	unsigned long calloc;
	unsigned long cfree;
	struct page *free_pages;
	struct folio *free_folios;
	spinlock_t lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
	return (mdevice->zone_device_type ==
		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE);
}

static enum migrate_vma_direction
dmirror_select_device(struct dmirror *dmirror)
{
	return (dmirror->mdevice->zone_device_type ==
		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
		MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
		MIGRATE_VMA_SELECT_DEVICE_COHERENT;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
{
	return container_of(page_pgmap(page), struct dmirror_chunk,
			    pagemap);
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return dmirror_page_to_chunk(page)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[32];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		memcpy_from_page(ptr, page, 0, PAGE_SIZE);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		memcpy_to_page(page, 0, ptr, PAGE_SIZE);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
				  struct page **ppage, bool is_large)
{
	struct dmirror_chunk *devmem;
	struct resource *res = NULL;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;
	int ret = -ENOMEM;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ret;

	switch (mdevice->zone_device_type) {
	case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
		res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
					      "hmm_dmirror");
		if (IS_ERR_OR_NULL(res))
			goto err_devmem;
		devmem->pagemap.range.start = res->start;
		devmem->pagemap.range.end = res->end;
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
		break;
	case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
		devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
						spm_addr_dev0 :
						spm_addr_dev1;
		devmem->pagemap.range.end = devmem->pagemap.range.start +
					    DEVMEM_CHUNK_SIZE - 1;
		devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
		break;
	default:
		ret = -EINVAL;
		goto err_devmem;
	}

	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
			       DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				      sizeof(new_chunks[0]) * new_capacity,
				      GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}
	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR_OR_NULL(ptr)) {
		if (ptr)
			ret = PTR_ERR(ptr);
		else
			ret = -EFAULT;
		goto err_release;
	}

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; ) {
		struct page *page = pfn_to_page(pfn);

		if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR)
			&& (pfn + HPAGE_PMD_NR <= pfn_last)) {
			page->zone_device_data = mdevice->free_folios;
			mdevice->free_folios = page_folio(page);
			pfn += HPAGE_PMD_NR;
			continue;
		}

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
		pfn++;
	}

	ret = 0;
	if (ppage) {
		if (is_large) {
			if (!mdevice->free_folios) {
				ret = -ENOMEM;
				goto err_unlock;
			}
			*ppage = folio_page(mdevice->free_folios, 0);
			mdevice->free_folios = (*ppage)->zone_device_data;
			mdevice->calloc += HPAGE_PMD_NR;
		} else if (mdevice->free_pages) {
			*ppage = mdevice->free_pages;
			mdevice->free_pages = (*ppage)->zone_device_data;
			mdevice->calloc++;
		} else {
			ret = -ENOMEM;
			goto err_unlock;
		}
	}
err_unlock:
	spin_unlock(&mdevice->lock);

	return ret;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
		release_mem_region(devmem->pagemap.range.start,
				   range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return ret;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror,
					      bool is_large)
{
	struct page *dpage = NULL;
	struct page *rpage = NULL;
	unsigned int order = is_large ? HPAGE_PMD_ORDER : 0;
	struct dmirror_device *mdevice = dmirror->mdevice;

	/*
	 * For ZONE_DEVICE private type, this is a fake device so we allocate
	 * real system memory to store our device memory.
	 * For ZONE_DEVICE coherent type we use the actual dpage to store the
	 * data and ignore rpage.
	 */
	if (dmirror_is_private_zone(mdevice)) {
		rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0);
		if (!rpage)
			return NULL;
	}
	spin_lock(&mdevice->lock);

	if (is_large && mdevice->free_folios) {
		dpage = folio_page(mdevice->free_folios, 0);
		mdevice->free_folios = dpage->zone_device_data;
		mdevice->calloc += 1 << order;
		spin_unlock(&mdevice->lock);
	} else if (!is_large && mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (dmirror_allocate_chunk(mdevice, &dpage, is_large))
			goto error;
	}

	zone_device_folio_init(page_folio(dpage), order);
	dpage->zone_device_data = rpage;
	return dpage;

error:
	if (rpage)
		__free_pages(rpage, order);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; ) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;
		bool is_large = *src & MIGRATE_PFN_COMPOUND;
		int write = (*src & MIGRATE_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
		unsigned long nr = 1;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			goto next;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);
		if (WARN(spage && is_zone_device_page(spage),
			 "page already in device spage pfn: 0x%lx\n",
			 page_to_pfn(spage)))
			goto next;

		if (dmirror->flags & HMM_DMIRROR_FLAG_FAIL_ALLOC) {
			dmirror->flags &= ~HMM_DMIRROR_FLAG_FAIL_ALLOC;
			dpage = NULL;
		} else
			dpage = dmirror_devmem_alloc_page(dmirror, is_large);

		if (!dpage) {
			struct folio *folio;
			unsigned long i;
			unsigned long spfn = *src >> MIGRATE_PFN_SHIFT;
			struct page *src_page;

			if (!is_large)
				goto next;

			if (!spage && is_large) {
				nr = HPAGE_PMD_NR;
			} else {
				folio = page_folio(spage);
				nr = folio_nr_pages(folio);
			}

			for (i = 0; i < nr && addr < args->end; i++) {
				dpage = dmirror_devmem_alloc_page(dmirror, false);
				rpage = BACKING_PAGE(dpage);
				rpage->zone_device_data = dmirror;

				*dst = migrate_pfn(page_to_pfn(dpage)) | write;
				src_page = pfn_to_page(spfn + i);

				if (spage)
					copy_highpage(rpage, src_page);
				else
					clear_highpage(rpage);
				src++;
				dst++;
				addr += PAGE_SIZE;
			}
			continue;
		}

		rpage = BACKING_PAGE(dpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
			 page_to_pfn(spage), page_to_pfn(dpage));

		*dst = migrate_pfn(page_to_pfn(dpage)) | write;

		if (is_large) {
			int i;
			struct folio *folio = page_folio(dpage);
			*dst |= MIGRATE_PFN_COMPOUND;

			if (folio_test_large(folio)) {
				for (i = 0; i < folio_nr_pages(folio); i++) {
					struct page *dst_page =
						pfn_to_page(page_to_pfn(rpage) + i);
					struct page *src_page =
						pfn_to_page(page_to_pfn(spage) + i);

					if (spage)
						copy_highpage(dst_page, src_page);
					else
						clear_highpage(dst_page);
					src++;
					dst++;
					addr += PAGE_SIZE;
				}
				continue;
			}
		}

		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

next:
		src++;
		dst++;
		addr += PAGE_SIZE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long addr, struct page *page,
			      struct dmirror *dmirror)
{
	void *entry;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	entry = xa_tag_pointer(page, DPT_XA_TAG_ATOMIC);
	entry = xa_store(&dmirror->pt, addr >> PAGE_SHIFT, entry, GFP_ATOMIC);
	if (xa_is_err(entry)) {
		mutex_unlock(&dmirror->mutex);
		return xa_err(entry);
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;
	const unsigned long start_pfn = start >> PAGE_SHIFT;
	const unsigned long end_pfn = end >> PAGE_SHIFT;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) {
		struct page *dpage;
		void *entry;
		int nr, i;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		if (*dst & MIGRATE_PFN_COMPOUND)
			nr = folio_nr_pages(page_folio(dpage));
		else
			nr = 1;

		WARN_ON_ONCE(end_pfn < start_pfn + nr);

		rpage = BACKING_PAGE(dpage);
		VM_WARN_ON(folio_nr_pages(page_folio(rpage)) != nr);

		for (i = 0; i < nr; i++) {
			entry = folio_page(page_folio(rpage), i);
			if (*dst & MIGRATE_PFN_WRITE)
				entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
			entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC);
			if (xa_is_err(entry)) {
				mutex_unlock(&dmirror->mutex);
				return xa_err(entry);
			}
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_bounce bounce;
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; !ret && addr < end; addr += PAGE_SIZE) {
		struct folio *folio;
		struct page *page;

		page = make_device_exclusive(mm, addr, NULL, &folio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			break;
		}

		ret = dmirror_atomic_map(addr, page, dmirror);
		folio_unlock(folio);
		folio_put(folio);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	if (ret)
		return ret;

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;
	unsigned int order = 0;
	int i;

	for (addr = start; addr < end; ) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) {
			addr += PAGE_SIZE;
			goto next;
		}

		if (WARN_ON(!is_device_private_page(spage) &&
			    !is_device_coherent_page(spage))) {
			addr += PAGE_SIZE;
			goto next;
		}

		spage = BACKING_PAGE(spage);
		order = folio_order(page_folio(spage));
		if (order)
			*dst = MIGRATE_PFN_COMPOUND;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;

		if (dmirror->flags & HMM_DMIRROR_FLAG_FAIL_ALLOC) {
			dmirror->flags &= ~HMM_DMIRROR_FLAG_FAIL_ALLOC;
			*dst &= ~MIGRATE_PFN_COMPOUND;
			dpage = NULL;
		} else if (order) {
			dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE,
						order, args->vma, addr), 0);
		} else {
			dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		}

		if (!dpage && !order)
			return VM_FAULT_OOM;

		/* This path migrates device memory back to system memory. */
		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
			 page_to_pfn(spage), page_to_pfn(dpage));

		if (dpage) {
			lock_page(dpage);
			*dst |= migrate_pfn(page_to_pfn(dpage));
		}

		for (i = 0; i < (1 << order); i++) {
			struct page *src_page;
			struct page *dst_page;

			/* Try with smaller pages if large allocation fails */
			if (!dpage && order) {
				dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
				lock_page(dpage);
				dst[i] = migrate_pfn(page_to_pfn(dpage));
				dst_page = pfn_to_page(page_to_pfn(dpage));
				dpage = NULL; /* For the next iteration */
			} else {
				dst_page = pfn_to_page(page_to_pfn(dpage) + i);
			}

			src_page = pfn_to_page(page_to_pfn(spage) + i);

			xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
			addr += PAGE_SIZE;
			copy_highpage(dst_page, src_page);
		}
next:
		src += 1 << order;
		dst += 1 << order;
	}
	return 0;
}

static unsigned long
dmirror_successful_migrated_pages(struct migrate_vma *migrate)
{
	unsigned long cpages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    migrate->src[i] & MIGRATE_PFN_MIGRATE)
			cpages++;
	}
	return cpages;
}

static int dmirror_migrate_to_system(struct dmirror *dmirror,
				     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	struct migrate_vma args = { 0 };
	unsigned long next;
	int ret;
	unsigned long *src_pfns;
	unsigned long *dst_pfns;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/* Allocate the pfn arrays only after the argument checks have passed. */
	src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns),
			    GFP_KERNEL | __GFP_NOFAIL);
	dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns),
			    GFP_KERNEL | __GFP_NOFAIL);

	cmd->cpages = 0;
	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = dmirror_select_device(dmirror) |
			     MIGRATE_VMA_SELECT_COMPOUND;

		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		pr_debug("Migrating from device mem to sys mem\n");
		dmirror_devmem_fault_alloc_and_copy(&args, dmirror);

		migrate_vma_pages(&args);
		cmd->cpages += dmirror_successful_migrated_pages(&args);
		migrate_vma_finalize(&args);
	}
out:
	mmap_read_unlock(mm);
	mmput(mm);
	kvfree(src_pfns);
	kvfree(dst_pfns);

	return ret;
}

static int dmirror_migrate_to_device(struct dmirror *dmirror,
				     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	struct dmirror_bounce bounce;
	struct migrate_vma args = { 0 };
	unsigned long next;
	int ret;
	unsigned long *src_pfns = NULL;
	unsigned long *dst_pfns = NULL;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	ret = -ENOMEM;
	src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns),
			    GFP_KERNEL | __GFP_NOFAIL);
	if (!src_pfns)
		goto free_mem;

	dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns),
			    GFP_KERNEL | __GFP_NOFAIL);
	if (!dst_pfns)
		goto free_mem;

	ret = 0;
	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM |
			     MIGRATE_VMA_SELECT_COMPOUND;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		pr_debug("Migrating from sys mem to device mem\n");
		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/*
	 * Return the migrated data for verification.
	 * Only for pages in device zone.
	 */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		goto free_mem;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	goto free_mem;

out:
	mmap_read_unlock(mm);
	mmput(mm);
free_mem:
	/* The pfn arrays were allocated with kvcalloc(), so use kvfree(). */
	kvfree(src_pfns);
	kvfree(dst_pfns);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_device_coherent_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
					const struct mmu_notifier_range *range,
					unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[32];
	unsigned char perm[32];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
{
	unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
	unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
	unsigned long npages = end_pfn - start_pfn + 1;
	unsigned long i;
	unsigned long *src_pfns;
	unsigned long *dst_pfns;
	unsigned int order = 0;

	src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
	dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);

	migrate_device_range(src_pfns, start_pfn, npages);
	for (i = 0; i < npages; i++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(src_pfns[i]);
		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		if (WARN_ON(!is_device_private_page(spage) &&
			    !is_device_coherent_page(spage)))
			continue;

		order = folio_order(page_folio(spage));
		spage = BACKING_PAGE(spage);
		if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
			dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE,
						       order), 0);
		} else {
			dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
			order = 0;
		}

		/* TODO Support splitting here */
		lock_page(dpage);
		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
		if (src_pfns[i] & MIGRATE_PFN_WRITE)
			dst_pfns[i] |= MIGRATE_PFN_WRITE;
		if (order)
			dst_pfns[i] |= MIGRATE_PFN_COMPOUND;
		folio_copy(page_folio(dpage), page_folio(spage));
	}
	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
	kvfree(src_pfns);
	kvfree(dst_pfns);
}

/* Removes free pages from the free list so they can't be re-allocated */
static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
{
	struct dmirror_device *mdevice = devmem->mdevice;
	struct page *page;
	struct folio *folio;

	for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio))
		if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem)
			mdevice->free_folios = folio_zone_device_data(folio);
	for (page = mdevice->free_pages; page; page = page->zone_device_data)
		if (dmirror_page_to_chunk(page) == devmem)
			mdevice->free_pages = page->zone_device_data;
}

static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
{
	unsigned int i;

	mutex_lock(&mdevice->devmem_lock);
	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			spin_lock(&mdevice->lock);
			devmem->remove = true;
			dmirror_remove_free_pages(devmem);
			spin_unlock(&mdevice->lock);

			dmirror_device_evict_chunk(devmem);
			memunmap_pages(&devmem->pagemap);
			if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
				release_mem_region(devmem->pagemap.range.start,
						   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		mdevice->devmem_count = 0;
		mdevice->devmem_capacity = 0;
		mdevice->free_pages = NULL;
		mdevice->free_folios = NULL;
		kfree(mdevice->devmem_chunks);
		mdevice->devmem_chunks = NULL;
	}
	mutex_unlock(&mdevice->devmem_lock);
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE_TO_DEV:
		ret = dmirror_migrate_to_device(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE_TO_SYS:
		ret = dmirror_migrate_to_system(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					   cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	case HMM_DMIRROR_RELEASE:
		dmirror_device_remove_chunks(dmirror->mdevice);
		ret = 0;
		break;

	case HMM_DMIRROR_FLAGS:
		dmirror->flags = cmd.npages;
		ret = 0;
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.mmap		= dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct folio *folio)
{
	struct page *page = &folio->page;
	struct page *rpage = BACKING_PAGE(page);
	struct dmirror_device *mdevice;
	struct folio *rfolio = page_folio(rpage);
	unsigned int order = folio_order(rfolio);

	if (rpage != page) {
		if (order)
			__free_pages(rpage, order);
		else
			__free_page(rpage);
		rpage = NULL;
	}

	mdevice = dmirror_page_to_device(page);
	spin_lock(&mdevice->lock);

	/* Return page to our allocator if not freeing the chunk */
	if (!dmirror_page_to_chunk(page)->remove) {
		mdevice->cfree += 1 << order;
		if (order) {
			page->zone_device_data = mdevice->free_folios;
			mdevice->free_folios = page_folio(page);
		} else {
			page->zone_device_data = mdevice->free_pages;
			mdevice->free_pages = page;
		}
	}
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args = { 0 };
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret = 0;
	unsigned int order, nr;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = folio_zone_device_data(page_folio(vmf->page));
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	order = folio_order(page_folio(vmf->page));
	nr = 1 << order;

	/*
	 * When folios are partially mapped, we can't rely on the folio
	 * order of vmf->page as the folio might not be fully split yet.
	 */
	if (vmf->pte) {
		order = 0;
		nr = 1;
	}

	/*
	 * Consider a per-cpu cache of src and dst pfns, but with
	 * large number of cpus that might not scale well.
	 */
	args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order));
	args.vma = vmf->vma;
	args.end = args.start + (PAGE_SIZE << order);

	nr = (args.end - args.start) >> PAGE_SHIFT;
	args.src = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL);
	args.dst = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL);
	args.pgmap_owner = dmirror->mdevice;
	args.flags = dmirror_select_device(dmirror);
	args.fault_page = vmf->page;

	if (!args.src || !args.dst) {
		ret = VM_FAULT_OOM;
		goto err;
	}

	if (order)
		args.flags |= MIGRATE_VMA_SELECT_COMPOUND;

	if (migrate_vma_setup(&args)) {
		/* Free the pfn arrays instead of leaking them on this path. */
		ret = VM_FAULT_SIGBUS;
		goto err;
	}

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		goto err;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
err:
	kfree(args.src);
	kfree(args.dst);
	return ret;
}

static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail)
{
	struct page *rpage = BACKING_PAGE(folio_page(head, 0));
	struct page *rpage_tail;
	struct folio *rfolio;
	unsigned long offset = 0;

	if (!rpage) {
		tail->page.zone_device_data = NULL;
		return;
	}

	rfolio = page_folio(rpage);

	if (tail == NULL) {
		folio_reset_order(rfolio);
		rfolio->mapping = NULL;
		folio_set_count(rfolio, 1);
		return;
	}

	offset = folio_pfn(tail) - folio_pfn(head);

	rpage_tail = folio_page(rfolio, offset);
	tail->page.zone_device_data = rpage_tail;
	rpage_tail->zone_device_data = rpage->zone_device_data;
	clear_compound_head(rpage_tail);
	rpage_tail->mapping = NULL;

	folio_page(tail, 0)->mapping = folio_page(head, 0)->mapping;
	tail->pgmap = head->pgmap;
	folio_set_count(page_folio(rpage_tail), 1);
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.folio_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
	.folio_split	= dmirror_devmem_folio_split,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	device_initialize(&mdevice->device);
	mdevice->device.devt = dev;

	ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id);
	if (ret)
		goto put_device;

	ret = cdev_device_add(&mdevice->cdevice, &mdevice->device);
	if (ret)
		goto put_device;

	/* Build a list of free ZONE_DEVICE struct pages */
	return dmirror_allocate_chunk(mdevice, NULL, false);

put_device:
	put_device(&mdevice->device);
	return ret;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	dmirror_device_remove_chunks(mdevice);
	cdev_device_del(&mdevice->cdevice, &mdevice->device);
	put_device(&mdevice->device);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id = 0;
	int ndevices = 0;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
	dmirror_devices[ndevices++].zone_device_type =
				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
	dmirror_devices[ndevices++].zone_device_type =
				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
	if (spm_addr_dev0 && spm_addr_dev1) {
		dmirror_devices[ndevices++].zone_device_type =
					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
		dmirror_devices[ndevices++].zone_device_type =
					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
	}
	for (id = 0; id < ndevices; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		if (dmirror_devices[id].zone_device_type)
			dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_DESCRIPTION("HMM (Heterogeneous Memory Management) test module");
MODULE_LICENSE("GPL");