/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	struct kref		kref;
	spinlock_t		lock;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);

	if (hmm && kref_get_unless_zero(&hmm->kref))
		return hmm;

	return NULL;
}

/**
 * hmm_get_or_create - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 * Returns: an HMM object, either by referencing the existing
 *          (per-process) object, or by creating a new one.
 *
 * This is not intended to be used directly by device drivers. If mm already
 * has an HMM struct then it gets a reference on it and returns it. Otherwise
 * it allocates an HMM struct, initializes it, associates it with the mm and
 * returns it.
 */
static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
	struct hmm *hmm = mm_get_hmm(mm);
	bool cleanup = false;

	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	kref_init(&hmm->kref);
	hmm->mm = mm;

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup)
		goto error;

	/*
	 * We should only get here if we hold the mmap_sem in write mode,
	 * i.e. on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
		goto error_mm;

	return hmm;

error_mm:
	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);
error:
	kfree(hmm);
	return NULL;
}

static void hmm_free(struct kref *kref)
{
	struct hmm *hmm = container_of(kref, struct hmm, kref);
	struct mm_struct *mm = hmm->mm;

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	kfree(hmm);
}

static inline void hmm_put(struct hmm *hmm)
{
	kref_put(&hmm->kref, hmm_free);
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	struct hmm *hmm;

	spin_lock(&mm->page_table_lock);
	hmm = mm_get_hmm(mm);
	mm->hmm = NULL;
	if (hmm) {
		hmm->mm = NULL;
		spin_unlock(&mm->page_table_lock);
		hmm_put(hmm);
		return;
	}

	spin_unlock(&mm->page_table_lock);
}

static int hmm_invalidate_range(struct hmm *hmm, bool device,
				const struct hmm_update *update)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		if (update->end < range->start || update->start >= range->end)
			continue;

		range->valid = false;
	}
	spin_unlock(&hmm->lock);

	if (!device)
		return 0;

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list) {
		int ret;

		ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
		if (!update->blockable && ret == -EAGAIN) {
			up_read(&hmm->mirrors_sem);
			return -EAGAIN;
		}
	}
	up_read(&hmm->mirrors_sem);

	return 0;
}
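
/*
 * Illustrative sketch only (not part of this file): a mirror's
 * ->sync_cpu_device_pagetables() callback, as invoked above, is expected to
 * invalidate the device page table for the updated range and to honor
 * update->blockable. The dummy_* names below are hypothetical driver code,
 * shown only to make the calling convention concrete:
 *
 *	static int dummy_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *					const struct hmm_update *update)
 *	{
 *		struct dummy_device *ddev;
 *
 *		ddev = container_of(mirror, struct dummy_device, mirror);
 *		if (!update->blockable && !dummy_trylock(ddev))
 *			return -EAGAIN;
 *		dummy_invalidate_device_ptes(ddev, update->start, update->end);
 *		dummy_unlock(ddev);
 *		return 0;
 *	}
 */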

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm_get_hmm(mm);

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so the callback can wait on any
			 * pending work that might itself trigger a
			 * mmu_notifier callback and thus would deadlock with
			 * us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);

	hmm_put(hmm);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
			const struct mmu_notifier_range *range)
{
	struct hmm *hmm = mm_get_hmm(range->mm);
	struct hmm_update update;
	int ret;

	VM_BUG_ON(!hmm);

	update.start = range->start;
	update.end = range->end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = range->blockable;
	ret = hmm_invalidate_range(hmm, true, &update);
	hmm_put(hmm);
	return ret;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
			const struct mmu_notifier_range *range)
{
	struct hmm *hmm = mm_get_hmm(range->mm);
	struct hmm_update update;

	VM_BUG_ON(!hmm);

	update.start = range->start;
	update.end = range->end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = true;
	hmm_invalidate_range(hmm, false, &update);
	hmm_put(hmm);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release		= hmm_release,
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_get_or_create(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = READ_ONCE(mirror->hmm);

	if (hmm == NULL)
		return;

	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	/* To protect us against double unregister ... */
	mirror->hmm = NULL;
	up_write(&hmm->mirrors_sem);

	hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
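
/*
 * Illustrative sketch only (not part of this file): a driver would normally
 * embed struct hmm_mirror in its own per-mm structure, provide a struct
 * hmm_mirror_ops with ->sync_cpu_device_pagetables() (and optionally
 * ->release()), and register the mirror while holding mmap_sem in write
 * mode, as required above. The dummy_* names are hypothetical:
 *
 *	static const struct hmm_mirror_ops dummy_mirror_ops = {
 *		.sync_cpu_device_pagetables = dummy_sync_cpu_device_pagetables,
 *		.release = dummy_release,
 *	};
 *
 *	static int dummy_mirror_mm(struct dummy_device *ddev,
 *				   struct mm_struct *mm)
 *	{
 *		int ret;
 *
 *		ddev->mirror.ops = &dummy_mirror_ops;
 *		down_write(&mm->mmap_sem);
 *		ret = hmm_mirror_register(&ddev->mirror, mm);
 *		up_write(&mm->mmap_sem);
 *		return ret;
 *	}
 */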

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Do we need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: handle migration, use the
		 * device private entry and report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EAGAIN;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping entry, then compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point it is either a
	 * valid pmd entry pointing to a pte directory or a bad pmd that will
	 * not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_range_snapshot() - snapshot CPU page table for a range
 * @range: range
 * Returns: number of valid pages in range->pfns[] (from range start
 *          address). This may be zero. If the return value is negative,
 *          then one of the following values may be returned:
 *
 *          -EINVAL  invalid arguments or mm or virtual address is in an
 *                   invalid vma (i.e. either hugetlbfs or device file vma).
 *          -EPERM   for example, asking for write when the range is
 *                   read-only
 *          -EAGAIN  caller needs to retry
 *          -EFAULT  either no valid vma exists for this range, or it is
 *                   illegal to access the range
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 */
long hmm_range_snapshot(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	range->hmm = NULL;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_get_or_create(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL) {
		hmm_put(hmm);
		return -EINVAL;
	}

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		hmm_put(hmm);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		hmm_put(hmm);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	/*
	 * Transfer the hmm reference to the range struct; it will be dropped
	 * inside the hmm_vma_range_done() function (which _must_ be called if
	 * this function returns 0).
	 */
	range->hmm = hmm;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_snapshot);

/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * The range struct is used to track updates to the CPU page table after a
 * call to either hmm_range_snapshot() or hmm_vma_fault(). Once the device
 * driver is done using the data, or wants to lock updates to the data it got
 * from those functions, it must call the hmm_vma_range_done() function, which
 * will then stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this :
 * again:
 *   hmm_range_snapshot(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_range_snapshot(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(range->pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	bool ret = false;

	/* Sanity check, this really should not happen. */
	if (range->hmm == NULL || range->end <= range->start) {
		BUG();
		return false;
	}

	spin_lock(&range->hmm->lock);
	list_del_rcu(&range->list);
	ret = range->valid;
	spin_unlock(&range->hmm->lock);

	/* Is the mm still alive ? */
	if (range->hmm->mm == NULL)
		ret = false;

	/* Drop reference taken by hmm_vma_fault() or hmm_range_snapshot() */
	hmm_put(range->hmm);
	range->hmm = NULL;
	return ret;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, write, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	range->hmm = NULL;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_get_or_create(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL) {
		hmm_put(hmm);
		return -EINVAL;
	}

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		hmm_put(hmm);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		hmm_put(hmm);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
		hmm_put(hmm);
	} else {
		/*
		 * Transfer the hmm reference to the range struct; it will be
		 * dropped inside the hmm_vma_range_done() function (which
		 * _must_ be called if this function returns 0).
		 */
		range->hmm = hmm;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	wait_for_completion(&devmem->completion);
	percpu_ref_exit(ref);
}

static void hmm_devmem_ref_kill(struct percpu_ref *ref)
{
	percpu_ref_kill(ref);
}

static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
				   unsigned long addr,
				   const struct page *page,
				   unsigned int flags,
				   pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	page->mapping = NULL;

	devmem->ops->free(devmem, page);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to the new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 *
 * The device driver should call this function during device initialization
 * and is then responsible for the memory management. HMM only provides
 * helpers.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	void *result;
	int ret;

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		return ERR_PTR(ret);

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource)
			return ERR_PTR(-ENOMEM);
		break;
	}
	if (!devmem->resource)
		return ERR_PTR(-ERANGE);

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add);
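
/*
 * Illustrative sketch only (not part of this file): a driver would typically
 * call hmm_devmem_add() once at device initialization, with a struct
 * hmm_devmem_ops providing the ->fault() and ->free() callbacks used above,
 * and remember the returned pfn range. The dummy_* names are hypothetical:
 *
 *	static const struct hmm_devmem_ops dummy_devmem_ops = {
 *		.free = dummy_devmem_free,
 *		.fault = dummy_devmem_fault,
 *	};
 *
 *	devmem = hmm_devmem_add(&dummy_devmem_ops, &ddev->device,
 *				DUMMY_DEVMEM_SIZE);
 *	if (IS_ERR(devmem))
 *		return PTR_ERR(devmem);
 *	ddev->pfn_first = devmem->pfn_first;
 *	ddev->pfn_last = devmem->pfn_last;
 */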

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	void *result;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
				       &devmem->ref);
	if (ret)
		return ERR_PTR(ret);

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper;
 * it is not required in order to use any other HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */