// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range *range;
	unsigned long last;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

/*
 * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
 * @range: range used to encode HMM pfn value
 * @pfn: pfn value for which to create the device entry
 * Return: valid device entry for the pfn
 */
static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
					  unsigned long pfn)
{
	return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
}

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
			 struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE)
		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
			return -EFAULT;
	return -EBUSY;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       uint64_t pfns, uint64_t cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the whole range. The API can be used in
	 * two ways: in the first, the HMM user coalesces multiple page faults
	 * into one request and sets flags per pfn for those faults; in the
	 * second, the HMM user wants to pre-fault a range with specific
	 * flags. For the latter it would be a waste to have the user pre-fill
	 * the pfn array with a default flags value.
	 */
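	/*
	 * For example (an illustrative sketch only; the flag bit values are
	 * whatever the driver registered in range->flags[]), pre-faulting a
	 * whole range with at least read permission could be requested with:
	 *
	 *	range->default_flags = range->flags[HMM_PFN_VALID];
	 *	range->pfn_flags_mask = 0;
	 *
	 * while fully per-pfn control would instead leave default_flags at 0,
	 * set pfn_flags_mask to cover the request bits (e.g. ~0ULL), and
	 * encode each request in the corresponding range->pfns[] entry.
	 */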
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return 0;

	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE]))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If CPU page table is not valid then we need to fault */
	if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
		return HMM_NEED_FAULT;
	return 0;
}

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const uint64_t *pfns, unsigned long npages,
		     uint64_t cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault = 0;
	unsigned long i;

	/*
	 * If the default flags do not request to fault pages, and the mask
	 * does not allow for individual pages to be faulted, then
	 * hmm_pte_need_fault() will always return 0.
	 */
	if (!((range->default_flags | range->pfn_flags_mask) &
	      range->flags[HMM_PFN_VALID]))
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |=
			hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
	if (!walk->vma) {
		if (required_fault)
			return -EFAULT;
		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
	}
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	hmm_vma_walk->last = addr;
	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		       unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline bool hmm_is_device_private_entry(struct hmm_range *range,
					       swp_entry_t entry)
{
	return is_device_private_entry(entry) &&
		device_private_entry_to_page(entry)->pgmap->owner ==
		range->dev_private_owner;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	if (pte_none(pte)) {
		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (required_fault)
			goto fault;
		*pfn = range->values[HMM_PFN_NONE];
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Never fault in device private pages, but just report
		 * the PFN even if not present.
		 */
		if (hmm_is_device_private_entry(range, entry)) {
			*pfn = hmm_device_entry_from_pfn(range,
					device_private_entry_to_pfn(entry));
			*pfn |= range->flags[HMM_PFN_VALID];
			if (is_write_device_private_entry(entry))
				*pfn |= range->flags[HMM_PFN_WRITE];
			return 0;
		}

		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (!required_fault) {
			*pfn = range->values[HMM_PFN_NONE];
			return 0;
		}

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault)
		goto fault;

	/*
	 * Since each architecture defines a struct page for the zero page,
	 * just fall through and treat it like a normal page.
	 */
	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*pfn = range->values[HMM_PFN_SPECIAL];
		return 0;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point it is either a
	 * valid pmd entry pointing to a pte directory or a bad pmd that will
	 * not recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		uint64_t *pfns, cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	unsigned int required_fault;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault) {
		spin_unlock(ptl);
		return hmm_vma_fault(addr, end, required_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry	NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->pfns +
					 ((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	hmm_vma_walk->last = end;

	/* Skip this vma and continue processing the next vma. */
	return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry = hmm_vma_walk_pud,
	.pmd_entry = hmm_vma_walk_pmd,
	.pte_hole = hmm_vma_walk_hole,
	.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
	.test_walk = hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: argument structure
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e. causing faults).
 */
long hmm_range_fault(struct hmm_range *range)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
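
/*
 * Illustrative caller sketch: driver_lock()/driver_unlock() and the device
 * page table update are placeholders for driver-specific code; the -EBUSY
 * retry and the mmu_interval_notifier sequencing follow the semantics
 * documented above and in Documentation/vm/hmm.rst.
 *
 *	int driver_populate_range(struct mmu_interval_notifier *notifier,
 *				  struct hmm_range *range)
 *	{
 *		struct mm_struct *mm = notifier->mm;
 *		long ret;
 *
 *		range->notifier = notifier;
 *	again:
 *		range->notifier_seq = mmu_interval_read_begin(notifier);
 *		down_read(&mm->mmap_sem);
 *		ret = hmm_range_fault(range);
 *		up_read(&mm->mmap_sem);
 *		if (ret < 0) {
 *			if (ret == -EBUSY)
 *				goto again;
 *			return ret;
 *		}
 *
 *		driver_lock();
 *		if (mmu_interval_read_retry(notifier, range->notifier_seq)) {
 *			driver_unlock();
 *			goto again;
 *		}
 *		(program the device page table from range->pfns[] here)
 *		driver_unlock();
 *		return 0;
 *	}
 */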