// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;
static struct page *dmirror_zero_page;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device	*mdevice;
	struct xarray		pt;
	struct mmu_interval_notifier	notifier;
	struct mutex		mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};

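/*
 * A device private struct page can be mapped back to its owning device:
 * the page's pgmap is embedded in a dmirror_chunk, so container_of()
 * recovers the chunk and, from it, the dmirror_device (this is what
 * dmirror_page_to_device() below does):
 *
 *	chunk = container_of(page->pgmap, struct dmirror_chunk, pagemap);
 *	mdevice = chunk->mdevice;
 */
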
/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

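/*
 * Illustrative sketch (not used directly): entries in dmirror->pt encode
 * both the mirrored page and whether the device may write to it, using an
 * XArray pointer tag. A lookup therefore untags before use:
 *
 *	void *entry = xa_load(&dmirror->pt, addr >> PAGE_SHIFT);
 *	struct page *page = xa_untag_pointer(entry);
 *	bool writable = xa_pointer_tag(entry) == DPT_XA_TAG_WRITE;
 *
 * dmirror_do_read() and dmirror_do_write() below follow this pattern.
 */
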
static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->migrate_pgmap_owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

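/*
 * The loop in dmirror_range_fault() is the canonical hmm_range_fault()
 * usage: sample the notifier sequence, fault the range under
 * mmap_read_lock(), then take the driver lock and recheck the sequence
 * before committing the result to the device page table. Roughly:
 *
 *	range->notifier_seq = mmu_interval_read_begin(range->notifier);
 *	mmap_read_lock(mm);
 *	ret = hmm_range_fault(range);
 *	mmap_read_unlock(mm);
 *	mutex_lock(&dmirror->mutex);
 *	if (mmu_interval_read_retry(range->notifier, range->notifier_seq))
 *		// drop the lock and retry from the top
 *	// otherwise update the device page table from range->hmm_pfns[]
 *	mutex_unlock(&dmirror->mutex);
 */
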
static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

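/*
 * Userspace usage sketch (illustrative, not part of this module): the
 * read/write paths above are driven through HMM_DMIRROR_READ/WRITE with a
 * struct hmm_dmirror_cmd describing a page-aligned range in the caller's
 * address space and a buffer to copy through. The device node path depends
 * on how the test environment creates it (the hmm selftests use
 * /dev/hmm_dmirror0 and /dev/hmm_dmirror1):
 *
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (uintptr_t)buf,		// page-aligned user range
 *		.ptr = (uintptr_t)mirror_buf,	// destination/source buffer
 *		.npages = npages,
 *	};
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);
 *	// cmd.cpages/cmd.faults report pages copied and faults taken
 */
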
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}

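/*
 * Free ZONE_DEVICE pages are kept on a simple singly linked list threaded
 * through page->zone_device_data; dmirror_devmem_alloc_page() below pops
 * one under mdevice->lock:
 *
 *	spin_lock(&mdevice->lock);
 *	dpage = mdevice->free_pages;
 *	if (dpage)
 *		mdevice->free_pages = dpage->zone_device_data;
 *	spin_unlock(&mdevice->lock);
 */
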
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage)) |
			    MIGRATE_PFN_LOCKED;
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

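/*
 * dmirror_migrate() above follows the standard migrate_vma sequence a real
 * device driver would use to move anonymous pages into device private
 * memory:
 *
 *	migrate_vma_setup(&args);	// isolate pages, install migration ptes
 *	// allocate device pages, copy the data, fill in args.dst[]
 *	migrate_vma_pages(&args);	// move the pages to the new locations
 *	// update the device's page tables
 *	migrate_vma_finalize(&args);	// restore CPU ptes, unlock and put pages
 */
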
static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.unlocked_ioctl	= dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	/*
	 * Allocate a zero page to simulate a reserved page of device private
	 * memory which is always zero. The zero_pfn page isn't used just to
	 * make the code here simpler (i.e., we need a struct page for it).
	 */
	dmirror_zero_page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
	if (!dmirror_zero_page) {
		ret = -ENOMEM;
		goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	if (dmirror_zero_page)
		__free_page(dmirror_zero_page);
	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");
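
/*
 * Snapshot usage sketch (userspace, illustrative): HMM_DMIRROR_SNAPSHOT
 * fills the buffer at cmd.ptr with one byte per page, composed by
 * dmirror_mkentry() from the HMM_DMIRROR_PROT_* values defined in
 * test_hmm_uapi.h:
 *
 *	unsigned char perm[NPAGES];
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (uintptr_t)buf,
 *		.ptr = (uintptr_t)perm,
 *		.npages = NPAGES,
 *	};
 *	ioctl(fd, HMM_DMIRROR_SNAPSHOT, &cmd);
 *	// perm[i] now describes how page i is mapped (read/write, zero
 *	// page, device private local/remote, PMD/PUD sized, or error).
 */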