// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
        struct hmm_range        *range;
        unsigned long           last;
};

enum {
        HMM_NEED_FAULT = 1 << 0,
        HMM_NEED_WRITE_FAULT = 1 << 1,
        HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
                         struct hmm_range *range, unsigned long cpu_flags)
{
        unsigned long i = (addr - range->start) >> PAGE_SHIFT;

        for (; addr < end; addr += PAGE_SIZE, i++)
                range->hmm_pfns[i] = cpu_flags;
        return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
                         unsigned int required_fault, struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct vm_area_struct *vma = walk->vma;
        unsigned int fault_flags = FAULT_FLAG_REMOTE;

        WARN_ON_ONCE(!required_fault);
        hmm_vma_walk->last = addr;

        if (required_fault & HMM_NEED_WRITE_FAULT) {
                if (!(vma->vm_flags & VM_WRITE))
                        return -EPERM;
                fault_flags |= FAULT_FLAG_WRITE;
        }

        for (; addr < end; addr += PAGE_SIZE)
                if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
                        return -EFAULT;
        return -EBUSY;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                       unsigned long pfn_req_flags,
                                       unsigned long cpu_flags)
{
        struct hmm_range *range = hmm_vma_walk->range;

        /*
         * We consider not only the individual per-page request but also the
         * default flags requested for the whole range. The API can be used
         * two ways: the first has the HMM user coalesce multiple page faults
         * into one request and set flags per pfn for those faults; the second
         * has the HMM user pre-fault a range with specific flags. For the
         * latter it would be a waste to have the user pre-fill the pfn array
         * with a default flags value.
         */
        pfn_req_flags &= range->pfn_flags_mask;
        pfn_req_flags |= range->default_flags;

        /* We aren't asked to do anything ... */
        if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
                return 0;

        /* Need to write fault? */
        if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
            !(cpu_flags & HMM_PFN_WRITE))
                return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

        /* If CPU page table is not valid then we need to fault */
        if (!(cpu_flags & HMM_PFN_VALID))
                return HMM_NEED_FAULT;
        return 0;
}
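/*
 * Illustrative sketch (not part of the original source): two common ways a
 * caller can set up default_flags and pfn_flags_mask before calling
 * hmm_range_fault(); "index_of_write" below is a hypothetical example index.
 *
 *      // Pre-fault the whole range for read access; per-pfn input values
 *      // are ignored because pfn_flags_mask is zero.
 *      range->default_flags = HMM_PFN_REQ_FAULT;
 *      range->pfn_flags_mask = 0;
 *
 *      // Fault every page for read access, and additionally request write
 *      // access for one specific page.
 *      range->default_flags = HMM_PFN_REQ_FAULT;
 *      range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
 *      range->hmm_pfns[index_of_write] = HMM_PFN_REQ_WRITE;
 */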

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                     const unsigned long hmm_pfns[], unsigned long npages,
                     unsigned long cpu_flags)
{
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault = 0;
        unsigned long i;

        /*
         * If the default flags do not request to fault pages, and the mask does
         * not allow for individual pages to be faulted, then
         * hmm_pte_need_fault() will always return 0.
         */
        if (!((range->default_flags | range->pfn_flags_mask) &
              HMM_PFN_REQ_FAULT))
                return 0;

        for (i = 0; i < npages; ++i) {
                required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
                                                     cpu_flags);
                if (required_fault == HMM_NEED_ALL_BITS)
                        return required_fault;
        }
        return required_fault;
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             __always_unused int depth, struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault;
        unsigned long i, npages;
        unsigned long *hmm_pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        hmm_pfns = &range->hmm_pfns[i];
        required_fault =
                hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
        if (!walk->vma) {
                if (required_fault)
                        return -EFAULT;
                return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
        }
        if (required_fault)
                return hmm_vma_fault(addr, end, required_fault, walk);
        return hmm_pfns_fill(addr, end, range, 0);
}

static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, unsigned long hmm_pfns[],
                              pmd_t pmd)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        unsigned int required_fault;
        unsigned long cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        required_fault =
                hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
        if (required_fault)
                return hmm_vma_fault(addr, end, required_fault, walk);

        pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
                hmm_pfns[i] = pfn | cpu_flags;
        return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                       unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
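/*
 * Worked example (illustrative only, assuming 4KB pages and a 2MB pmd, not
 * part of the original source): for a huge pmd covering 0x200000-0x3fffff,
 * a walk starting at addr 0x203000 gives ((addr & ~PMD_MASK) >> PAGE_SHIFT)
 * == 3, so the loop in hmm_vma_handle_pmd() above reports pmd_pfn(pmd) + 3,
 * + 4, + 5, ... with the same cpu_flags for every page of the range.
 */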

static inline bool hmm_is_device_private_entry(struct hmm_range *range,
                                               swp_entry_t entry)
{
        return is_device_private_entry(entry) &&
                device_private_entry_to_page(entry)->pgmap->owner ==
                range->dev_private_owner;
}

static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
                return 0;
        return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              unsigned long *hmm_pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned int required_fault;
        unsigned long cpu_flags;
        pte_t pte = *ptep;
        uint64_t pfn_req_flags = *hmm_pfn;

        if (pte_none(pte)) {
                required_fault =
                        hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
                if (required_fault)
                        goto fault;
                *hmm_pfn = 0;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                /*
                 * Never fault in device private pages, but just report
                 * the PFN even if not present.
                 */
                if (hmm_is_device_private_entry(range, entry)) {
                        cpu_flags = HMM_PFN_VALID;
                        if (is_write_device_private_entry(entry))
                                cpu_flags |= HMM_PFN_WRITE;
                        *hmm_pfn = device_private_entry_to_pfn(entry) |
                                        cpu_flags;
                        return 0;
                }

                required_fault =
                        hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
                if (!required_fault) {
                        *hmm_pfn = 0;
                        return 0;
                }

                if (!non_swap_entry(entry))
                        goto fault;

                if (is_migration_entry(entry)) {
                        pte_unmap(ptep);
                        hmm_vma_walk->last = addr;
                        migration_entry_wait(walk->mm, pmdp, addr);
                        return -EBUSY;
                }

                /* Report error for everything else */
                pte_unmap(ptep);
                return -EFAULT;
        }

        cpu_flags = pte_to_hmm_pfn_flags(range, pte);
        required_fault =
                hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
        if (required_fault)
                goto fault;

        /*
         * Since each architecture defines a struct page for the zero page, just
         * fall through and treat it like a normal page.
         */
        if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
                if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
                        pte_unmap(ptep);
                        return -EFAULT;
                }
                *hmm_pfn = HMM_PFN_ERROR;
                return 0;
        }

        *hmm_pfn = pte_pfn(pte) | cpu_flags;
        return 0;

fault:
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_fault(addr, end, required_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long *hmm_pfns =
                &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
        unsigned long npages = (end - start) >> PAGE_SHIFT;
        unsigned long addr = start;
        pte_t *ptep;
        pmd_t pmd;

again:
        pmd = READ_ONCE(*pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, -1, walk);

        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(walk->mm, pmdp);
                        return -EBUSY;
                }
                return hmm_pfns_fill(start, end, range, 0);
        }

        if (!pmd_present(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
                        return -EFAULT;
                return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
        }

        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take pmd_lock here: even if some other thread
                 * is splitting the huge pmd, we will get that event through
                 * the mmu_notifier callback.
                 *
                 * So just read the pmd value again, check that it is still a
                 * transparent huge or device mapping, and compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
        }

        /*
         * We have handled all the valid cases above, i.e. none, migration,
         * huge or transparent huge. At this point it is either a valid pmd
         * entry pointing to a pte directory or a bad pmd that will not
         * recover.
         */
        if (pmd_bad(pmd)) {
                if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
                        return -EFAULT;
                return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
        }

        ptep = pte_offset_map(pmdp, addr);
        for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
                if (r) {
                        /* hmm_vma_handle_pte() did pte_unmap() */
                        return r;
                }
        }
        pte_unmap(ptep - 1);
        return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
                                                 pud_t pud)
{
        if (!pud_present(pud))
                return 0;
        return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long addr = start;
        pud_t pud;
        int ret = 0;
        spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

        if (!ptl)
                return 0;

        /* Normally we don't want to split the huge page */
        walk->action = ACTION_CONTINUE;

        pud = READ_ONCE(*pudp);
        if (pud_none(pud)) {
                spin_unlock(ptl);
                return hmm_vma_walk_hole(start, end, -1, walk);
        }

        if (pud_huge(pud) && pud_devmap(pud)) {
                unsigned long i, npages, pfn;
                unsigned int required_fault;
                unsigned long *hmm_pfns;
                unsigned long cpu_flags;

                if (!pud_present(pud)) {
                        spin_unlock(ptl);
                        return hmm_vma_walk_hole(start, end, -1, walk);
                }

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                hmm_pfns = &range->hmm_pfns[i];

                cpu_flags = pud_to_hmm_pfn_flags(range, pud);
                required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
                                                      npages, cpu_flags);
                if (required_fault) {
                        spin_unlock(ptl);
                        return hmm_vma_fault(addr, end, required_fault, walk);
                }

                pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
                for (i = 0; i < npages; ++i, ++pfn)
                        hmm_pfns[i] = pfn | cpu_flags;
                goto out_unlock;
        }

        /* Ask for the PUD to be split */
        walk->action = ACTION_SUBTREE;

out_unlock:
        spin_unlock(ptl);
        return ret;
}
#else
#define hmm_vma_walk_pud        NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                      unsigned long start, unsigned long end,
                                      struct mm_walk *walk)
{
        unsigned long addr = start, i, pfn;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        unsigned int required_fault;
        unsigned long pfn_req_flags;
        unsigned long cpu_flags;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
        entry = huge_ptep_get(pte);

        i = (start - range->start) >> PAGE_SHIFT;
        pfn_req_flags = range->hmm_pfns[i];
        cpu_flags = pte_to_hmm_pfn_flags(range, entry);
        required_fault =
                hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
        if (required_fault) {
                spin_unlock(ptl);
                return hmm_vma_fault(addr, end, required_fault, walk);
        }

        pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
        for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                range->hmm_pfns[i] = pfn | cpu_flags;

        spin_unlock(ptl);
        return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;

        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
            vma->vm_flags & VM_READ)
                return 0;

        /*
         * vma ranges that don't have struct page backing them or map I/O
         * devices directly cannot be handled by hmm_range_fault().
         *
         * If the vma does not allow read access, then assume that it does not
         * allow write access either. HMM does not support architectures that
         * allow write without read.
         *
         * If a fault is requested for an unsupported range then it is a hard
         * failure.
         */
        if (hmm_range_need_fault(hmm_vma_walk,
                                 range->hmm_pfns +
                                         ((start - range->start) >> PAGE_SHIFT),
                                 (end - start) >> PAGE_SHIFT, 0))
                return -EFAULT;

        hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

        /* Skip this vma and continue processing the next vma. */
        return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
        .pud_entry      = hmm_vma_walk_pud,
        .pmd_entry      = hmm_vma_walk_pmd,
        .pte_hole       = hmm_vma_walk_hole,
        .hugetlb_entry  = hmm_vma_walk_hugetlb_entry,
        .test_walk      = hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range:      argument structure
 *
 * Returns 0 on success or one of the following error codes:
 *
 * -EINVAL:     Invalid arguments or mm or virtual address is in an invalid vma
 *              (e.g., device file vma).
 * -ENOMEM:     Out of memory.
 * -EPERM:      Invalid permission (e.g., asking for write and range is read
 *              only).
 * -EBUSY:      The range has been invalidated and the caller needs to wait for
 *              the invalidation to finish.
 * -EFAULT:     A page was requested to be valid and could not be made valid,
 *              i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e. causing faults).
 */
int hmm_range_fault(struct hmm_range *range)
{
        struct hmm_vma_walk hmm_vma_walk = {
                .range = range,
                .last = range->start,
        };
        struct mm_struct *mm = range->notifier->mm;
        int ret;

        lockdep_assert_held(&mm->mmap_sem);

        do {
                /* If range is no longer valid force retry. */
                if (mmu_interval_check_retry(range->notifier,
                                             range->notifier_seq))
                        return -EBUSY;
                ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
                                      &hmm_walk_ops, &hmm_vma_walk);
                /*
                 * When -EBUSY is returned the loop restarts with
                 * hmm_vma_walk.last set to an address that has not been stored
                 * in pfns. All entries < last in the pfn array are set to their
                 * output, and all >= are still at their input values.
                 */
        } while (ret == -EBUSY);
        return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
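/*
 * Illustrative caller sketch (not part of the original source), adapted from
 * the pattern described in Documentation/vm/hmm.rst; driver_lock(),
 * driver_unlock() and the surrounding driver structures are hypothetical
 * placeholders:
 *
 *      range.notifier = &interval_sub;
 *      range.start = start;
 *      range.end = start + nr_pages * PAGE_SIZE;
 *      range.hmm_pfns = pfns;
 *      range.default_flags = HMM_PFN_REQ_FAULT;
 *
 * again:
 *      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
 *      down_read(&mm->mmap_sem);
 *      ret = hmm_range_fault(&range);
 *      up_read(&mm->mmap_sem);
 *      if (ret) {
 *              if (ret == -EBUSY)
 *                      goto again;
 *              return ret;
 *      }
 *
 *      driver_lock();
 *      if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
 *              driver_unlock();
 *              goto again;
 *      }
 *      // program the device page tables from pfns[] while holding the lock
 *      driver_unlock();
 */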