1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * This is a module to test the HMM (Heterogeneous Memory Management) 4 * mirror and zone device private memory migration APIs of the kernel. 5 * Userspace programs can register with the driver to mirror their own address 6 * space and can use the device to read/write any valid virtual address. 7 */ 8 #include <linux/init.h> 9 #include <linux/fs.h> 10 #include <linux/mm.h> 11 #include <linux/module.h> 12 #include <linux/kernel.h> 13 #include <linux/cdev.h> 14 #include <linux/device.h> 15 #include <linux/mutex.h> 16 #include <linux/rwsem.h> 17 #include <linux/sched.h> 18 #include <linux/slab.h> 19 #include <linux/highmem.h> 20 #include <linux/delay.h> 21 #include <linux/pagemap.h> 22 #include <linux/hmm.h> 23 #include <linux/vmalloc.h> 24 #include <linux/swap.h> 25 #include <linux/swapops.h> 26 #include <linux/sched/mm.h> 27 #include <linux/platform_device.h> 28 #include <linux/rmap.h> 29 30 #include "test_hmm_uapi.h" 31 32 #define DMIRROR_NDEVICES 2 33 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 34 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) 35 #define DEVMEM_CHUNKS_RESERVE 16 36 37 static const struct dev_pagemap_ops dmirror_devmem_ops; 38 static const struct mmu_interval_notifier_ops dmirror_min_ops; 39 static dev_t dmirror_dev; 40 41 struct dmirror_device; 42 43 struct dmirror_bounce { 44 void *ptr; 45 unsigned long size; 46 unsigned long addr; 47 unsigned long cpages; 48 }; 49 50 #define DPT_XA_TAG_ATOMIC 1UL 51 #define DPT_XA_TAG_WRITE 3UL 52 53 /* 54 * Data structure to track address ranges and register for mmu interval 55 * notifier updates. 56 */ 57 struct dmirror_interval { 58 struct mmu_interval_notifier notifier; 59 struct dmirror *dmirror; 60 }; 61 62 /* 63 * Data attached to the open device file. 64 * Note that it might be shared after a fork(). 65 */ 66 struct dmirror { 67 struct dmirror_device *mdevice; 68 struct xarray pt; 69 struct mmu_interval_notifier notifier; 70 struct mutex mutex; 71 }; 72 73 /* 74 * ZONE_DEVICE pages for migration and simulating device memory. 75 */ 76 struct dmirror_chunk { 77 struct dev_pagemap pagemap; 78 struct dmirror_device *mdevice; 79 }; 80 81 /* 82 * Per device data. 83 */ 84 struct dmirror_device { 85 struct cdev cdevice; 86 struct hmm_devmem *devmem; 87 88 unsigned int devmem_capacity; 89 unsigned int devmem_count; 90 struct dmirror_chunk **devmem_chunks; 91 struct mutex devmem_lock; /* protects the above */ 92 93 unsigned long calloc; 94 unsigned long cfree; 95 struct page *free_pages; 96 spinlock_t lock; /* protects the above */ 97 }; 98 99 static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES]; 100 101 static int dmirror_bounce_init(struct dmirror_bounce *bounce, 102 unsigned long addr, 103 unsigned long size) 104 { 105 bounce->addr = addr; 106 bounce->size = size; 107 bounce->cpages = 0; 108 bounce->ptr = vmalloc(size); 109 if (!bounce->ptr) 110 return -ENOMEM; 111 return 0; 112 } 113 114 static void dmirror_bounce_fini(struct dmirror_bounce *bounce) 115 { 116 vfree(bounce->ptr); 117 } 118 119 static int dmirror_fops_open(struct inode *inode, struct file *filp) 120 { 121 struct cdev *cdev = inode->i_cdev; 122 struct dmirror *dmirror; 123 int ret; 124 125 /* Mirror this process address space */ 126 dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL); 127 if (dmirror == NULL) 128 return -ENOMEM; 129 130 dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice); 131 mutex_init(&dmirror->mutex); 132 xa_init(&dmirror->pt); 133 134 ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm, 135 0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops); 136 if (ret) { 137 kfree(dmirror); 138 return ret; 139 } 140 141 filp->private_data = dmirror; 142 return 0; 143 } 144 145 static int dmirror_fops_release(struct inode *inode, struct file *filp) 146 { 147 struct dmirror *dmirror = filp->private_data; 148 149 mmu_interval_notifier_remove(&dmirror->notifier); 150 xa_destroy(&dmirror->pt); 151 kfree(dmirror); 152 return 0; 153 } 154 155 static struct dmirror_device *dmirror_page_to_device(struct page *page) 156 157 { 158 return container_of(page->pgmap, struct dmirror_chunk, 159 pagemap)->mdevice; 160 } 161 162 static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range) 163 { 164 unsigned long *pfns = range->hmm_pfns; 165 unsigned long pfn; 166 167 for (pfn = (range->start >> PAGE_SHIFT); 168 pfn < (range->end >> PAGE_SHIFT); 169 pfn++, pfns++) { 170 struct page *page; 171 void *entry; 172 173 /* 174 * Since we asked for hmm_range_fault() to populate pages, 175 * it shouldn't return an error entry on success. 176 */ 177 WARN_ON(*pfns & HMM_PFN_ERROR); 178 WARN_ON(!(*pfns & HMM_PFN_VALID)); 179 180 page = hmm_pfn_to_page(*pfns); 181 WARN_ON(!page); 182 183 entry = page; 184 if (*pfns & HMM_PFN_WRITE) 185 entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 186 else if (WARN_ON(range->default_flags & HMM_PFN_WRITE)) 187 return -EFAULT; 188 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 189 if (xa_is_err(entry)) 190 return xa_err(entry); 191 } 192 193 return 0; 194 } 195 196 static void dmirror_do_update(struct dmirror *dmirror, unsigned long start, 197 unsigned long end) 198 { 199 unsigned long pfn; 200 void *entry; 201 202 /* 203 * The XArray doesn't hold references to pages since it relies on 204 * the mmu notifier to clear page pointers when they become stale. 205 * Therefore, it is OK to just clear the entry. 206 */ 207 xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT, 208 end >> PAGE_SHIFT) 209 xa_erase(&dmirror->pt, pfn); 210 } 211 212 static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni, 213 const struct mmu_notifier_range *range, 214 unsigned long cur_seq) 215 { 216 struct dmirror *dmirror = container_of(mni, struct dmirror, notifier); 217 218 /* 219 * Ignore invalidation callbacks for device private pages since 220 * the invalidation is handled as part of the migration process. 221 */ 222 if (range->event == MMU_NOTIFY_MIGRATE && 223 range->owner == dmirror->mdevice) 224 return true; 225 226 if (mmu_notifier_range_blockable(range)) 227 mutex_lock(&dmirror->mutex); 228 else if (!mutex_trylock(&dmirror->mutex)) 229 return false; 230 231 mmu_interval_set_seq(mni, cur_seq); 232 dmirror_do_update(dmirror, range->start, range->end); 233 234 mutex_unlock(&dmirror->mutex); 235 return true; 236 } 237 238 static const struct mmu_interval_notifier_ops dmirror_min_ops = { 239 .invalidate = dmirror_interval_invalidate, 240 }; 241 242 static int dmirror_range_fault(struct dmirror *dmirror, 243 struct hmm_range *range) 244 { 245 struct mm_struct *mm = dmirror->notifier.mm; 246 unsigned long timeout = 247 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 248 int ret; 249 250 while (true) { 251 if (time_after(jiffies, timeout)) { 252 ret = -EBUSY; 253 goto out; 254 } 255 256 range->notifier_seq = mmu_interval_read_begin(range->notifier); 257 mmap_read_lock(mm); 258 ret = hmm_range_fault(range); 259 mmap_read_unlock(mm); 260 if (ret) { 261 if (ret == -EBUSY) 262 continue; 263 goto out; 264 } 265 266 mutex_lock(&dmirror->mutex); 267 if (mmu_interval_read_retry(range->notifier, 268 range->notifier_seq)) { 269 mutex_unlock(&dmirror->mutex); 270 continue; 271 } 272 break; 273 } 274 275 ret = dmirror_do_fault(dmirror, range); 276 277 mutex_unlock(&dmirror->mutex); 278 out: 279 return ret; 280 } 281 282 static int dmirror_fault(struct dmirror *dmirror, unsigned long start, 283 unsigned long end, bool write) 284 { 285 struct mm_struct *mm = dmirror->notifier.mm; 286 unsigned long addr; 287 unsigned long pfns[64]; 288 struct hmm_range range = { 289 .notifier = &dmirror->notifier, 290 .hmm_pfns = pfns, 291 .pfn_flags_mask = 0, 292 .default_flags = 293 HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0), 294 .dev_private_owner = dmirror->mdevice, 295 }; 296 int ret = 0; 297 298 /* Since the mm is for the mirrored process, get a reference first. */ 299 if (!mmget_not_zero(mm)) 300 return 0; 301 302 for (addr = start; addr < end; addr = range.end) { 303 range.start = addr; 304 range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end); 305 306 ret = dmirror_range_fault(dmirror, &range); 307 if (ret) 308 break; 309 } 310 311 mmput(mm); 312 return ret; 313 } 314 315 static int dmirror_do_read(struct dmirror *dmirror, unsigned long start, 316 unsigned long end, struct dmirror_bounce *bounce) 317 { 318 unsigned long pfn; 319 void *ptr; 320 321 ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK); 322 323 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 324 void *entry; 325 struct page *page; 326 void *tmp; 327 328 entry = xa_load(&dmirror->pt, pfn); 329 page = xa_untag_pointer(entry); 330 if (!page) 331 return -ENOENT; 332 333 tmp = kmap(page); 334 memcpy(ptr, tmp, PAGE_SIZE); 335 kunmap(page); 336 337 ptr += PAGE_SIZE; 338 bounce->cpages++; 339 } 340 341 return 0; 342 } 343 344 static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) 345 { 346 struct dmirror_bounce bounce; 347 unsigned long start, end; 348 unsigned long size = cmd->npages << PAGE_SHIFT; 349 int ret; 350 351 start = cmd->addr; 352 end = start + size; 353 if (end < start) 354 return -EINVAL; 355 356 ret = dmirror_bounce_init(&bounce, start, size); 357 if (ret) 358 return ret; 359 360 while (1) { 361 mutex_lock(&dmirror->mutex); 362 ret = dmirror_do_read(dmirror, start, end, &bounce); 363 mutex_unlock(&dmirror->mutex); 364 if (ret != -ENOENT) 365 break; 366 367 start = cmd->addr + (bounce.cpages << PAGE_SHIFT); 368 ret = dmirror_fault(dmirror, start, end, false); 369 if (ret) 370 break; 371 cmd->faults++; 372 } 373 374 if (ret == 0) { 375 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 376 bounce.size)) 377 ret = -EFAULT; 378 } 379 cmd->cpages = bounce.cpages; 380 dmirror_bounce_fini(&bounce); 381 return ret; 382 } 383 384 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start, 385 unsigned long end, struct dmirror_bounce *bounce) 386 { 387 unsigned long pfn; 388 void *ptr; 389 390 ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK); 391 392 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 393 void *entry; 394 struct page *page; 395 void *tmp; 396 397 entry = xa_load(&dmirror->pt, pfn); 398 page = xa_untag_pointer(entry); 399 if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE) 400 return -ENOENT; 401 402 tmp = kmap(page); 403 memcpy(tmp, ptr, PAGE_SIZE); 404 kunmap(page); 405 406 ptr += PAGE_SIZE; 407 bounce->cpages++; 408 } 409 410 return 0; 411 } 412 413 static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) 414 { 415 struct dmirror_bounce bounce; 416 unsigned long start, end; 417 unsigned long size = cmd->npages << PAGE_SHIFT; 418 int ret; 419 420 start = cmd->addr; 421 end = start + size; 422 if (end < start) 423 return -EINVAL; 424 425 ret = dmirror_bounce_init(&bounce, start, size); 426 if (ret) 427 return ret; 428 if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr), 429 bounce.size)) { 430 ret = -EFAULT; 431 goto fini; 432 } 433 434 while (1) { 435 mutex_lock(&dmirror->mutex); 436 ret = dmirror_do_write(dmirror, start, end, &bounce); 437 mutex_unlock(&dmirror->mutex); 438 if (ret != -ENOENT) 439 break; 440 441 start = cmd->addr + (bounce.cpages << PAGE_SHIFT); 442 ret = dmirror_fault(dmirror, start, end, true); 443 if (ret) 444 break; 445 cmd->faults++; 446 } 447 448 fini: 449 cmd->cpages = bounce.cpages; 450 dmirror_bounce_fini(&bounce); 451 return ret; 452 } 453 454 static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, 455 struct page **ppage) 456 { 457 struct dmirror_chunk *devmem; 458 struct resource *res; 459 unsigned long pfn; 460 unsigned long pfn_first; 461 unsigned long pfn_last; 462 void *ptr; 463 464 devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); 465 if (!devmem) 466 return false; 467 468 res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, 469 "hmm_dmirror"); 470 if (IS_ERR(res)) 471 goto err_devmem; 472 473 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; 474 devmem->pagemap.range.start = res->start; 475 devmem->pagemap.range.end = res->end; 476 devmem->pagemap.nr_range = 1; 477 devmem->pagemap.ops = &dmirror_devmem_ops; 478 devmem->pagemap.owner = mdevice; 479 480 mutex_lock(&mdevice->devmem_lock); 481 482 if (mdevice->devmem_count == mdevice->devmem_capacity) { 483 struct dmirror_chunk **new_chunks; 484 unsigned int new_capacity; 485 486 new_capacity = mdevice->devmem_capacity + 487 DEVMEM_CHUNKS_RESERVE; 488 new_chunks = krealloc(mdevice->devmem_chunks, 489 sizeof(new_chunks[0]) * new_capacity, 490 GFP_KERNEL); 491 if (!new_chunks) 492 goto err_release; 493 mdevice->devmem_capacity = new_capacity; 494 mdevice->devmem_chunks = new_chunks; 495 } 496 497 ptr = memremap_pages(&devmem->pagemap, numa_node_id()); 498 if (IS_ERR(ptr)) 499 goto err_release; 500 501 devmem->mdevice = mdevice; 502 pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; 503 pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT); 504 mdevice->devmem_chunks[mdevice->devmem_count++] = devmem; 505 506 mutex_unlock(&mdevice->devmem_lock); 507 508 pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n", 509 DEVMEM_CHUNK_SIZE / (1024 * 1024), 510 mdevice->devmem_count, 511 mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)), 512 pfn_first, pfn_last); 513 514 spin_lock(&mdevice->lock); 515 for (pfn = pfn_first; pfn < pfn_last; pfn++) { 516 struct page *page = pfn_to_page(pfn); 517 518 page->zone_device_data = mdevice->free_pages; 519 mdevice->free_pages = page; 520 } 521 if (ppage) { 522 *ppage = mdevice->free_pages; 523 mdevice->free_pages = (*ppage)->zone_device_data; 524 mdevice->calloc++; 525 } 526 spin_unlock(&mdevice->lock); 527 528 return true; 529 530 err_release: 531 mutex_unlock(&mdevice->devmem_lock); 532 release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); 533 err_devmem: 534 kfree(devmem); 535 536 return false; 537 } 538 539 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) 540 { 541 struct page *dpage = NULL; 542 struct page *rpage; 543 544 /* 545 * This is a fake device so we alloc real system memory to store 546 * our device memory. 547 */ 548 rpage = alloc_page(GFP_HIGHUSER); 549 if (!rpage) 550 return NULL; 551 552 spin_lock(&mdevice->lock); 553 554 if (mdevice->free_pages) { 555 dpage = mdevice->free_pages; 556 mdevice->free_pages = dpage->zone_device_data; 557 mdevice->calloc++; 558 spin_unlock(&mdevice->lock); 559 } else { 560 spin_unlock(&mdevice->lock); 561 if (!dmirror_allocate_chunk(mdevice, &dpage)) 562 goto error; 563 } 564 565 dpage->zone_device_data = rpage; 566 get_page(dpage); 567 lock_page(dpage); 568 return dpage; 569 570 error: 571 __free_page(rpage); 572 return NULL; 573 } 574 575 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, 576 struct dmirror *dmirror) 577 { 578 struct dmirror_device *mdevice = dmirror->mdevice; 579 const unsigned long *src = args->src; 580 unsigned long *dst = args->dst; 581 unsigned long addr; 582 583 for (addr = args->start; addr < args->end; addr += PAGE_SIZE, 584 src++, dst++) { 585 struct page *spage; 586 struct page *dpage; 587 struct page *rpage; 588 589 if (!(*src & MIGRATE_PFN_MIGRATE)) 590 continue; 591 592 /* 593 * Note that spage might be NULL which is OK since it is an 594 * unallocated pte_none() or read-only zero page. 595 */ 596 spage = migrate_pfn_to_page(*src); 597 598 dpage = dmirror_devmem_alloc_page(mdevice); 599 if (!dpage) 600 continue; 601 602 rpage = dpage->zone_device_data; 603 if (spage) 604 copy_highpage(rpage, spage); 605 else 606 clear_highpage(rpage); 607 608 /* 609 * Normally, a device would use the page->zone_device_data to 610 * point to the mirror but here we use it to hold the page for 611 * the simulated device memory and that page holds the pointer 612 * to the mirror. 613 */ 614 rpage->zone_device_data = dmirror; 615 616 *dst = migrate_pfn(page_to_pfn(dpage)) | 617 MIGRATE_PFN_LOCKED; 618 if ((*src & MIGRATE_PFN_WRITE) || 619 (!spage && args->vma->vm_flags & VM_WRITE)) 620 *dst |= MIGRATE_PFN_WRITE; 621 } 622 } 623 624 static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start, 625 unsigned long end) 626 { 627 unsigned long pfn; 628 629 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { 630 void *entry; 631 struct page *page; 632 633 entry = xa_load(&dmirror->pt, pfn); 634 page = xa_untag_pointer(entry); 635 if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC) 636 return -EPERM; 637 } 638 639 return 0; 640 } 641 642 static int dmirror_atomic_map(unsigned long start, unsigned long end, 643 struct page **pages, struct dmirror *dmirror) 644 { 645 unsigned long pfn, mapped = 0; 646 int i; 647 648 /* Map the migrated pages into the device's page tables. */ 649 mutex_lock(&dmirror->mutex); 650 651 for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) { 652 void *entry; 653 654 if (!pages[i]) 655 continue; 656 657 entry = pages[i]; 658 entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC); 659 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 660 if (xa_is_err(entry)) { 661 mutex_unlock(&dmirror->mutex); 662 return xa_err(entry); 663 } 664 665 mapped++; 666 } 667 668 mutex_unlock(&dmirror->mutex); 669 return mapped; 670 } 671 672 static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, 673 struct dmirror *dmirror) 674 { 675 unsigned long start = args->start; 676 unsigned long end = args->end; 677 const unsigned long *src = args->src; 678 const unsigned long *dst = args->dst; 679 unsigned long pfn; 680 681 /* Map the migrated pages into the device's page tables. */ 682 mutex_lock(&dmirror->mutex); 683 684 for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, 685 src++, dst++) { 686 struct page *dpage; 687 void *entry; 688 689 if (!(*src & MIGRATE_PFN_MIGRATE)) 690 continue; 691 692 dpage = migrate_pfn_to_page(*dst); 693 if (!dpage) 694 continue; 695 696 /* 697 * Store the page that holds the data so the page table 698 * doesn't have to deal with ZONE_DEVICE private pages. 699 */ 700 entry = dpage->zone_device_data; 701 if (*dst & MIGRATE_PFN_WRITE) 702 entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 703 entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 704 if (xa_is_err(entry)) { 705 mutex_unlock(&dmirror->mutex); 706 return xa_err(entry); 707 } 708 } 709 710 mutex_unlock(&dmirror->mutex); 711 return 0; 712 } 713 714 static int dmirror_exclusive(struct dmirror *dmirror, 715 struct hmm_dmirror_cmd *cmd) 716 { 717 unsigned long start, end, addr; 718 unsigned long size = cmd->npages << PAGE_SHIFT; 719 struct mm_struct *mm = dmirror->notifier.mm; 720 struct page *pages[64]; 721 struct dmirror_bounce bounce; 722 unsigned long next; 723 int ret; 724 725 start = cmd->addr; 726 end = start + size; 727 if (end < start) 728 return -EINVAL; 729 730 /* Since the mm is for the mirrored process, get a reference first. */ 731 if (!mmget_not_zero(mm)) 732 return -EINVAL; 733 734 mmap_read_lock(mm); 735 for (addr = start; addr < end; addr = next) { 736 unsigned long mapped; 737 int i; 738 739 if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)) 740 next = end; 741 else 742 next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT); 743 744 ret = make_device_exclusive_range(mm, addr, next, pages, NULL); 745 mapped = dmirror_atomic_map(addr, next, pages, dmirror); 746 for (i = 0; i < ret; i++) { 747 if (pages[i]) { 748 unlock_page(pages[i]); 749 put_page(pages[i]); 750 } 751 } 752 753 if (addr + (mapped << PAGE_SHIFT) < next) { 754 mmap_read_unlock(mm); 755 mmput(mm); 756 return -EBUSY; 757 } 758 } 759 mmap_read_unlock(mm); 760 mmput(mm); 761 762 /* Return the migrated data for verification. */ 763 ret = dmirror_bounce_init(&bounce, start, size); 764 if (ret) 765 return ret; 766 mutex_lock(&dmirror->mutex); 767 ret = dmirror_do_read(dmirror, start, end, &bounce); 768 mutex_unlock(&dmirror->mutex); 769 if (ret == 0) { 770 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 771 bounce.size)) 772 ret = -EFAULT; 773 } 774 775 cmd->cpages = bounce.cpages; 776 dmirror_bounce_fini(&bounce); 777 return ret; 778 } 779 780 static int dmirror_migrate(struct dmirror *dmirror, 781 struct hmm_dmirror_cmd *cmd) 782 { 783 unsigned long start, end, addr; 784 unsigned long size = cmd->npages << PAGE_SHIFT; 785 struct mm_struct *mm = dmirror->notifier.mm; 786 struct vm_area_struct *vma; 787 unsigned long src_pfns[64]; 788 unsigned long dst_pfns[64]; 789 struct dmirror_bounce bounce; 790 struct migrate_vma args; 791 unsigned long next; 792 int ret; 793 794 start = cmd->addr; 795 end = start + size; 796 if (end < start) 797 return -EINVAL; 798 799 /* Since the mm is for the mirrored process, get a reference first. */ 800 if (!mmget_not_zero(mm)) 801 return -EINVAL; 802 803 mmap_read_lock(mm); 804 for (addr = start; addr < end; addr = next) { 805 vma = vma_lookup(mm, addr); 806 if (!vma || !(vma->vm_flags & VM_READ)) { 807 ret = -EINVAL; 808 goto out; 809 } 810 next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); 811 if (next > vma->vm_end) 812 next = vma->vm_end; 813 814 args.vma = vma; 815 args.src = src_pfns; 816 args.dst = dst_pfns; 817 args.start = addr; 818 args.end = next; 819 args.pgmap_owner = dmirror->mdevice; 820 args.flags = MIGRATE_VMA_SELECT_SYSTEM; 821 ret = migrate_vma_setup(&args); 822 if (ret) 823 goto out; 824 825 dmirror_migrate_alloc_and_copy(&args, dmirror); 826 migrate_vma_pages(&args); 827 dmirror_migrate_finalize_and_map(&args, dmirror); 828 migrate_vma_finalize(&args); 829 } 830 mmap_read_unlock(mm); 831 mmput(mm); 832 833 /* Return the migrated data for verification. */ 834 ret = dmirror_bounce_init(&bounce, start, size); 835 if (ret) 836 return ret; 837 mutex_lock(&dmirror->mutex); 838 ret = dmirror_do_read(dmirror, start, end, &bounce); 839 mutex_unlock(&dmirror->mutex); 840 if (ret == 0) { 841 if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, 842 bounce.size)) 843 ret = -EFAULT; 844 } 845 cmd->cpages = bounce.cpages; 846 dmirror_bounce_fini(&bounce); 847 return ret; 848 849 out: 850 mmap_read_unlock(mm); 851 mmput(mm); 852 return ret; 853 } 854 855 static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, 856 unsigned char *perm, unsigned long entry) 857 { 858 struct page *page; 859 860 if (entry & HMM_PFN_ERROR) { 861 *perm = HMM_DMIRROR_PROT_ERROR; 862 return; 863 } 864 if (!(entry & HMM_PFN_VALID)) { 865 *perm = HMM_DMIRROR_PROT_NONE; 866 return; 867 } 868 869 page = hmm_pfn_to_page(entry); 870 if (is_device_private_page(page)) { 871 /* Is the page migrated to this device or some other? */ 872 if (dmirror->mdevice == dmirror_page_to_device(page)) 873 *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; 874 else 875 *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; 876 } else if (is_zero_pfn(page_to_pfn(page))) 877 *perm = HMM_DMIRROR_PROT_ZERO; 878 else 879 *perm = HMM_DMIRROR_PROT_NONE; 880 if (entry & HMM_PFN_WRITE) 881 *perm |= HMM_DMIRROR_PROT_WRITE; 882 else 883 *perm |= HMM_DMIRROR_PROT_READ; 884 if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT) 885 *perm |= HMM_DMIRROR_PROT_PMD; 886 else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT) 887 *perm |= HMM_DMIRROR_PROT_PUD; 888 } 889 890 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni, 891 const struct mmu_notifier_range *range, 892 unsigned long cur_seq) 893 { 894 struct dmirror_interval *dmi = 895 container_of(mni, struct dmirror_interval, notifier); 896 struct dmirror *dmirror = dmi->dmirror; 897 898 if (mmu_notifier_range_blockable(range)) 899 mutex_lock(&dmirror->mutex); 900 else if (!mutex_trylock(&dmirror->mutex)) 901 return false; 902 903 /* 904 * Snapshots only need to set the sequence number since any 905 * invalidation in the interval invalidates the whole snapshot. 906 */ 907 mmu_interval_set_seq(mni, cur_seq); 908 909 mutex_unlock(&dmirror->mutex); 910 return true; 911 } 912 913 static const struct mmu_interval_notifier_ops dmirror_mrn_ops = { 914 .invalidate = dmirror_snapshot_invalidate, 915 }; 916 917 static int dmirror_range_snapshot(struct dmirror *dmirror, 918 struct hmm_range *range, 919 unsigned char *perm) 920 { 921 struct mm_struct *mm = dmirror->notifier.mm; 922 struct dmirror_interval notifier; 923 unsigned long timeout = 924 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 925 unsigned long i; 926 unsigned long n; 927 int ret = 0; 928 929 notifier.dmirror = dmirror; 930 range->notifier = ¬ifier.notifier; 931 932 ret = mmu_interval_notifier_insert(range->notifier, mm, 933 range->start, range->end - range->start, 934 &dmirror_mrn_ops); 935 if (ret) 936 return ret; 937 938 while (true) { 939 if (time_after(jiffies, timeout)) { 940 ret = -EBUSY; 941 goto out; 942 } 943 944 range->notifier_seq = mmu_interval_read_begin(range->notifier); 945 946 mmap_read_lock(mm); 947 ret = hmm_range_fault(range); 948 mmap_read_unlock(mm); 949 if (ret) { 950 if (ret == -EBUSY) 951 continue; 952 goto out; 953 } 954 955 mutex_lock(&dmirror->mutex); 956 if (mmu_interval_read_retry(range->notifier, 957 range->notifier_seq)) { 958 mutex_unlock(&dmirror->mutex); 959 continue; 960 } 961 break; 962 } 963 964 n = (range->end - range->start) >> PAGE_SHIFT; 965 for (i = 0; i < n; i++) 966 dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]); 967 968 mutex_unlock(&dmirror->mutex); 969 out: 970 mmu_interval_notifier_remove(range->notifier); 971 return ret; 972 } 973 974 static int dmirror_snapshot(struct dmirror *dmirror, 975 struct hmm_dmirror_cmd *cmd) 976 { 977 struct mm_struct *mm = dmirror->notifier.mm; 978 unsigned long start, end; 979 unsigned long size = cmd->npages << PAGE_SHIFT; 980 unsigned long addr; 981 unsigned long next; 982 unsigned long pfns[64]; 983 unsigned char perm[64]; 984 char __user *uptr; 985 struct hmm_range range = { 986 .hmm_pfns = pfns, 987 .dev_private_owner = dmirror->mdevice, 988 }; 989 int ret = 0; 990 991 start = cmd->addr; 992 end = start + size; 993 if (end < start) 994 return -EINVAL; 995 996 /* Since the mm is for the mirrored process, get a reference first. */ 997 if (!mmget_not_zero(mm)) 998 return -EINVAL; 999 1000 /* 1001 * Register a temporary notifier to detect invalidations even if it 1002 * overlaps with other mmu_interval_notifiers. 1003 */ 1004 uptr = u64_to_user_ptr(cmd->ptr); 1005 for (addr = start; addr < end; addr = next) { 1006 unsigned long n; 1007 1008 next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end); 1009 range.start = addr; 1010 range.end = next; 1011 1012 ret = dmirror_range_snapshot(dmirror, &range, perm); 1013 if (ret) 1014 break; 1015 1016 n = (range.end - range.start) >> PAGE_SHIFT; 1017 if (copy_to_user(uptr, perm, n)) { 1018 ret = -EFAULT; 1019 break; 1020 } 1021 1022 cmd->cpages += n; 1023 uptr += n; 1024 } 1025 mmput(mm); 1026 1027 return ret; 1028 } 1029 1030 static long dmirror_fops_unlocked_ioctl(struct file *filp, 1031 unsigned int command, 1032 unsigned long arg) 1033 { 1034 void __user *uarg = (void __user *)arg; 1035 struct hmm_dmirror_cmd cmd; 1036 struct dmirror *dmirror; 1037 int ret; 1038 1039 dmirror = filp->private_data; 1040 if (!dmirror) 1041 return -EINVAL; 1042 1043 if (copy_from_user(&cmd, uarg, sizeof(cmd))) 1044 return -EFAULT; 1045 1046 if (cmd.addr & ~PAGE_MASK) 1047 return -EINVAL; 1048 if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT))) 1049 return -EINVAL; 1050 1051 cmd.cpages = 0; 1052 cmd.faults = 0; 1053 1054 switch (command) { 1055 case HMM_DMIRROR_READ: 1056 ret = dmirror_read(dmirror, &cmd); 1057 break; 1058 1059 case HMM_DMIRROR_WRITE: 1060 ret = dmirror_write(dmirror, &cmd); 1061 break; 1062 1063 case HMM_DMIRROR_MIGRATE: 1064 ret = dmirror_migrate(dmirror, &cmd); 1065 break; 1066 1067 case HMM_DMIRROR_EXCLUSIVE: 1068 ret = dmirror_exclusive(dmirror, &cmd); 1069 break; 1070 1071 case HMM_DMIRROR_CHECK_EXCLUSIVE: 1072 ret = dmirror_check_atomic(dmirror, cmd.addr, 1073 cmd.addr + (cmd.npages << PAGE_SHIFT)); 1074 break; 1075 1076 case HMM_DMIRROR_SNAPSHOT: 1077 ret = dmirror_snapshot(dmirror, &cmd); 1078 break; 1079 1080 default: 1081 return -EINVAL; 1082 } 1083 if (ret) 1084 return ret; 1085 1086 if (copy_to_user(uarg, &cmd, sizeof(cmd))) 1087 return -EFAULT; 1088 1089 return 0; 1090 } 1091 1092 static const struct file_operations dmirror_fops = { 1093 .open = dmirror_fops_open, 1094 .release = dmirror_fops_release, 1095 .unlocked_ioctl = dmirror_fops_unlocked_ioctl, 1096 .llseek = default_llseek, 1097 .owner = THIS_MODULE, 1098 }; 1099 1100 static void dmirror_devmem_free(struct page *page) 1101 { 1102 struct page *rpage = page->zone_device_data; 1103 struct dmirror_device *mdevice; 1104 1105 if (rpage) 1106 __free_page(rpage); 1107 1108 mdevice = dmirror_page_to_device(page); 1109 1110 spin_lock(&mdevice->lock); 1111 mdevice->cfree++; 1112 page->zone_device_data = mdevice->free_pages; 1113 mdevice->free_pages = page; 1114 spin_unlock(&mdevice->lock); 1115 } 1116 1117 static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, 1118 struct dmirror *dmirror) 1119 { 1120 const unsigned long *src = args->src; 1121 unsigned long *dst = args->dst; 1122 unsigned long start = args->start; 1123 unsigned long end = args->end; 1124 unsigned long addr; 1125 1126 for (addr = start; addr < end; addr += PAGE_SIZE, 1127 src++, dst++) { 1128 struct page *dpage, *spage; 1129 1130 spage = migrate_pfn_to_page(*src); 1131 if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) 1132 continue; 1133 spage = spage->zone_device_data; 1134 1135 dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); 1136 if (!dpage) 1137 continue; 1138 1139 lock_page(dpage); 1140 xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); 1141 copy_highpage(dpage, spage); 1142 *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; 1143 if (*src & MIGRATE_PFN_WRITE) 1144 *dst |= MIGRATE_PFN_WRITE; 1145 } 1146 return 0; 1147 } 1148 1149 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) 1150 { 1151 struct migrate_vma args; 1152 unsigned long src_pfns; 1153 unsigned long dst_pfns; 1154 struct page *rpage; 1155 struct dmirror *dmirror; 1156 vm_fault_t ret; 1157 1158 /* 1159 * Normally, a device would use the page->zone_device_data to point to 1160 * the mirror but here we use it to hold the page for the simulated 1161 * device memory and that page holds the pointer to the mirror. 1162 */ 1163 rpage = vmf->page->zone_device_data; 1164 dmirror = rpage->zone_device_data; 1165 1166 /* FIXME demonstrate how we can adjust migrate range */ 1167 args.vma = vmf->vma; 1168 args.start = vmf->address; 1169 args.end = args.start + PAGE_SIZE; 1170 args.src = &src_pfns; 1171 args.dst = &dst_pfns; 1172 args.pgmap_owner = dmirror->mdevice; 1173 args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 1174 1175 if (migrate_vma_setup(&args)) 1176 return VM_FAULT_SIGBUS; 1177 1178 ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); 1179 if (ret) 1180 return ret; 1181 migrate_vma_pages(&args); 1182 /* 1183 * No device finalize step is needed since 1184 * dmirror_devmem_fault_alloc_and_copy() will have already 1185 * invalidated the device page table. 1186 */ 1187 migrate_vma_finalize(&args); 1188 return 0; 1189 } 1190 1191 static const struct dev_pagemap_ops dmirror_devmem_ops = { 1192 .page_free = dmirror_devmem_free, 1193 .migrate_to_ram = dmirror_devmem_fault, 1194 }; 1195 1196 static int dmirror_device_init(struct dmirror_device *mdevice, int id) 1197 { 1198 dev_t dev; 1199 int ret; 1200 1201 dev = MKDEV(MAJOR(dmirror_dev), id); 1202 mutex_init(&mdevice->devmem_lock); 1203 spin_lock_init(&mdevice->lock); 1204 1205 cdev_init(&mdevice->cdevice, &dmirror_fops); 1206 mdevice->cdevice.owner = THIS_MODULE; 1207 ret = cdev_add(&mdevice->cdevice, dev, 1); 1208 if (ret) 1209 return ret; 1210 1211 /* Build a list of free ZONE_DEVICE private struct pages */ 1212 dmirror_allocate_chunk(mdevice, NULL); 1213 1214 return 0; 1215 } 1216 1217 static void dmirror_device_remove(struct dmirror_device *mdevice) 1218 { 1219 unsigned int i; 1220 1221 if (mdevice->devmem_chunks) { 1222 for (i = 0; i < mdevice->devmem_count; i++) { 1223 struct dmirror_chunk *devmem = 1224 mdevice->devmem_chunks[i]; 1225 1226 memunmap_pages(&devmem->pagemap); 1227 release_mem_region(devmem->pagemap.range.start, 1228 range_len(&devmem->pagemap.range)); 1229 kfree(devmem); 1230 } 1231 kfree(mdevice->devmem_chunks); 1232 } 1233 1234 cdev_del(&mdevice->cdevice); 1235 } 1236 1237 static int __init hmm_dmirror_init(void) 1238 { 1239 int ret; 1240 int id; 1241 1242 ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, 1243 "HMM_DMIRROR"); 1244 if (ret) 1245 goto err_unreg; 1246 1247 for (id = 0; id < DMIRROR_NDEVICES; id++) { 1248 ret = dmirror_device_init(dmirror_devices + id, id); 1249 if (ret) 1250 goto err_chrdev; 1251 } 1252 1253 pr_info("HMM test module loaded. This is only for testing HMM.\n"); 1254 return 0; 1255 1256 err_chrdev: 1257 while (--id >= 0) 1258 dmirror_device_remove(dmirror_devices + id); 1259 unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); 1260 err_unreg: 1261 return ret; 1262 } 1263 1264 static void __exit hmm_dmirror_exit(void) 1265 { 1266 int id; 1267 1268 for (id = 0; id < DMIRROR_NDEVICES; id++) 1269 dmirror_device_remove(dmirror_devices + id); 1270 unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); 1271 } 1272 1273 module_init(hmm_dmirror_init); 1274 module_exit(hmm_dmirror_exit); 1275 MODULE_LICENSE("GPL"); 1276