// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#include "internal.h"

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
			 struct hmm_range *range, unsigned long cpu_flags)
{
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;

	for (; addr < end; addr += PAGE_SIZE, i++)
		range->hmm_pfns[i] = cpu_flags;
	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE)
		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
		    VM_FAULT_ERROR)
			return -EFAULT;
	return -EBUSY;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       unsigned long pfn_req_flags,
				       unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	/*
	 * Consider not only the individual per-page request but also the
	 * default flags requested for the range. The API can be used two
	 * ways: the first where the HMM user coalesces multiple page faults
	 * into one request and sets flags per pfn for those faults, and the
	 * second where the HMM user wants to pre-fault a range with specific
	 * flags. For the latter it would be a waste to have the user pre-fill
	 * the pfn array with a default flags value.
	 */
	pfn_req_flags &= range->pfn_flags_mask;
	pfn_req_flags |= range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
		return 0;

	/* Need to write fault? */
	if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
	    !(cpu_flags & HMM_PFN_WRITE))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If CPU page table is not valid then we need to fault */
	if (!(cpu_flags & HMM_PFN_VALID))
		return HMM_NEED_FAULT;
	return 0;
}
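
/*
 * Illustrative sketch (not upstream text): the two usage modes described
 * above, expressed as the range setup a caller might do. The field values
 * here are example assumptions, not requirements.
 *
 * Mode 1 - coalesced per-page faults: leave default_flags empty, let the
 * mask pass per-pfn request bits through, and mark individual entries:
 *
 *	range->default_flags = 0;
 *	range->pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
 *	range->hmm_pfns[i] = HMM_PFN_REQ_FAULT;	(fault only entry i)
 *
 * Mode 2 - pre-fault the whole range: set default_flags and clear
 * pfn_flags_mask so whatever was left in the pfn array is ignored:
 *
 *	range->default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
 *	range->pfn_flags_mask = 0;
 */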

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const unsigned long hmm_pfns[], unsigned long npages,
		     unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault = 0;
	unsigned long i;

	/*
	 * If the default flags do not request to fault pages, and the mask does
	 * not allow for individual pages to be faulted, then
	 * hmm_pte_need_fault() will always return 0.
	 */
	if (!((range->default_flags | range->pfn_flags_mask) &
	      HMM_PFN_REQ_FAULT))
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
						     cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	unsigned long *hmm_pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	hmm_pfns = &range->hmm_pfns[i];
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
	if (!walk->vma) {
		if (required_fault)
			return -EFAULT;
		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
	}
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	return hmm_pfns_fill(addr, end, range, 0);
}

static inline unsigned long hmm_pfn_flags_order(unsigned long order)
{
	return order << HMM_PFN_ORDER_SHIFT;
}
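
/*
 * Note (sketch, not upstream text): the inverse of this encoding is
 * hmm_pfn_to_map_order() in include/linux/hmm.h; a consumer of the output
 * array reads the order back roughly as:
 *
 *	unsigned int order = hmm_pfn_to_map_order(range->hmm_pfns[i]);
 *	unsigned long npages = 1UL << order;	(entry covers npages pages)
 */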

static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
						 pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, unsigned long hmm_pfns[],
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	unsigned long cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		hmm_pfns[i] = pfn | cpu_flags;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		       unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
						 pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      unsigned long *hmm_pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long cpu_flags;
	pte_t pte = *ptep;
	uint64_t pfn_req_flags = *hmm_pfn;

	if (pte_none_mostly(pte)) {
		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (required_fault)
			goto fault;
		*hmm_pfn = 0;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Don't fault in device private pages owned by the caller,
		 * just report the PFN.
		 */
		if (is_device_private_entry(entry) &&
		    pfn_swap_entry_to_page(entry)->pgmap->owner ==
		    range->dev_private_owner) {
			cpu_flags = HMM_PFN_VALID;
			if (is_writable_device_private_entry(entry))
				cpu_flags |= HMM_PFN_WRITE;
			*hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
			return 0;
		}

		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (!required_fault) {
			*hmm_pfn = 0;
			return 0;
		}

		if (!non_swap_entry(entry))
			goto fault;

		if (is_device_private_entry(entry))
			goto fault;

		if (is_device_exclusive_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault)
		goto fault;

	/*
	 * Bypass a devmap pte such as a DAX page when all requested pfn
	 * flags (pfn_req_flags) are fulfilled.
	 * Since each architecture defines a struct page for the zero page, just
	 * fall through and treat it like a normal page.
	 */
	if (!vm_normal_page(walk->vma, addr, pte) &&
	    !pte_devmap(pte) &&
	    !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*hmm_pfn = HMM_PFN_ERROR;
		return 0;
	}

	*hmm_pfn = pte_pfn(pte) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}
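
/*
 * Illustrative note (not upstream text): the device-private branch above is
 * why a driver that wants its own device-private pages reported, instead of
 * faulted back to system memory, sets range->dev_private_owner to the same
 * pointer it used as the owner of its pagemap, e.g. with a hypothetical
 * driver object "drv":
 *
 *	range->dev_private_owner = drv;	(must match drv's pgmap->owner)
 */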

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long *hmm_pfns =
		&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, 0);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one and compute the
		 * corresponding pfn values.
		 */
		pmd = pmdp_get_lockless(pmdp);
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	if (!ptep)
		goto again;
	for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			return r;
		}
	}
	pte_unmap(ptep - 1);
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
						 pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		unsigned long *hmm_pfns;
		unsigned long cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		hmm_pfns = &range->hmm_pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			hmm_pfns[i] = pfn | cpu_flags;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	unsigned int required_fault;
	unsigned long pfn_req_flags;
	unsigned long cpu_flags;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	pfn_req_flags = range->hmm_pfns[i];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
		    hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault) {
		int ret;

		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
		/*
		 * Avoid deadlock: drop the vma lock before calling
		 * hmm_vma_fault(), which will itself potentially take and
		 * drop the vma lock. This is also correct from a
		 * protection point of view, because there is no further
		 * use here of either pte or ptl after dropping the vma
		 * lock.
		 */
		ret = hmm_vma_fault(addr, end, required_fault, walk);
		hugetlb_vma_lock_read(vma);
		return ret;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->hmm_pfns[i] = pfn | cpu_flags;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->hmm_pfns +
					 ((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	/* Skip this vma and continue processing the next vma. */
	return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some addresses in a virtual address range
 * @range: argument structure
 *
 * Returns 0 on success or one of the following error codes:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e. causing faults).
 */
int hmm_range_fault(struct hmm_range *range)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	mmap_assert_locked(mm);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
		/*
		 * When -EBUSY is returned the loop restarts with
		 * hmm_vma_walk.last set to an address that has not been stored
		 * in pfns. All entries < last in the pfn array are set to their
		 * output, and all >= are still at their input values.
		 */
	} while (ret == -EBUSY);
	return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
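
/*
 * Usage sketch (illustrative, adapted from Documentation/mm/hmm.rst; not
 * part of this file): hmm_range_fault() is meant to be called inside a
 * mmu_interval_notifier read/retry loop. "interval_sub", "driver" and
 * "device_update_page_tables" are placeholder driver-side names.
 *
 *	struct hmm_range range = {
 *		.notifier = &interval_sub,
 *		.start = start,
 *		.end = end,
 *		.hmm_pfns = pfns,
 *		.default_flags = HMM_PFN_REQ_FAULT,
 *	};
 *
 *	do {
 *		range.notifier_seq = mmu_interval_read_begin(&interval_sub);
 *		mmap_read_lock(mm);
 *		ret = hmm_range_fault(&range);
 *		mmap_read_unlock(mm);
 *		if (ret) {
 *			if (ret == -EBUSY)
 *				continue;
 *			return ret;
 *		}
 *		take_lock(driver->update);
 *		if (mmu_interval_read_retry(&interval_sub,
 *					    range.notifier_seq)) {
 *			release_lock(driver->update);
 *			continue;
 *		}
 *		break;
 *	} while (1);
 *
 *	ret = device_update_page_tables(...);
 *	release_lock(driver->update);
 */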