// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

/*
 * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
 * @range: range used to encode HMM pfn value
 * @pfn: pfn value for which to create the device entry
 * Return: valid device entry for the pfn
 */
static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
					  unsigned long pfn)
{
	return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
}
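/*
 * For illustration: a device entry is just the pfn shifted up by
 * range->pfn_shift with the range's VALID flag or'd in, so the caller can
 * recover the pfn (and from it the struct page) once an entry is filled.
 * A minimal sketch, assuming the matching decode helper
 * hmm_device_entry_to_pfn() declared in include/linux/hmm.h:
 *
 *	uint64_t entry = hmm_device_entry_from_pfn(range, pfn);
 *
 *	if (entry & range->flags[HMM_PFN_VALID]) {
 *		unsigned long decoded = hmm_device_entry_to_pfn(range, entry);
 *		struct page *page = pfn_to_page(decoded); // decoded == pfn
 *	}
 */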
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
		struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (!vma)
		goto out_error;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE, i++)
		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
			goto out_error;

	return -EBUSY;

out_error:
	pfns[i] = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       uint64_t pfns, uint64_t cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	/*
	 * We not only consider the individual per-page request, we also
	 * consider the default flags requested for the range. The API can
	 * be used two ways. In the first, the HMM user coalesces multiple
	 * page faults into one request and sets flags per pfn for those
	 * faults. In the second, the HMM user wants to pre-fault a range
	 * with specific flags. For the latter it is a waste to have the
	 * user pre-fill the pfn array with a default flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return 0;

	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE]))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If CPU page table is not valid then we need to fault */
	if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
		return HMM_NEED_FAULT;
	return 0;
}
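/*
 * The pre-fault mode described above, as a minimal sketch: request that
 * every page in the range be made valid (and writable) regardless of what
 * the per-pfn entries say. The field values below are illustrative, not a
 * fixed recipe:
 *
 *	range->default_flags = range->flags[HMM_PFN_VALID] |
 *			       range->flags[HMM_PFN_WRITE];
 *	range->pfn_flags_mask = 0;	// ignore whatever is in pfns[]
 *
 * Conversely, with pfn_flags_mask set to ~0ULL and default_flags set to 0,
 * the caller gets the per-pfn mode where each pfns[] entry carries its own
 * fault request.
 */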
static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const uint64_t *pfns, unsigned long npages,
		     uint64_t cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault = 0;
	unsigned long i;

	/*
	 * If the default flags do not request to fault pages, and the mask does
	 * not allow for individual pages to be faulted, then
	 * hmm_pte_need_fault() will always return 0.
	 */
	if (!((range->default_flags | range->pfn_flags_mask) &
	      range->flags[HMM_PFN_VALID]))
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |=
			hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}
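/*
 * A corollary worth spelling out: a pure snapshot (read the CPU page
 * table, never fault anything in) is configured by leaving both knobs
 * clear, which makes the early return above fire for every range. A
 * minimal sketch of that configuration:
 *
 *	range->default_flags = 0;
 *	range->pfn_flags_mask = 0;
 */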
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	hmm_vma_walk->last = addr;
	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline bool hmm_is_device_private_entry(struct hmm_range *range,
		swp_entry_t entry)
{
	return is_device_private_entry(entry) &&
		device_private_entry_to_page(entry)->pgmap->owner ==
		range->dev_private_owner;
}
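/*
 * Only device-private pages whose pgmap->owner matches the caller's
 * range->dev_private_owner are reported as device entries; anyone else's
 * device-private pages are treated like ordinary non-present ptes. A
 * hedged sketch of the caller side (the owner wiring at memremap_pages()
 * time is the driver's own, and driver_private_cookie is an illustrative
 * name, not an API):
 *
 *	// when the driver creates its device-private pages:
 *	pgmap->owner = driver_private_cookie;
 *
 *	// at fault time, ask HMM to hand back only its own pages:
 *	range.dev_private_owner = driver_private_cookie;
 */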
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	if (pte_none(pte)) {
		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (required_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Never fault in device private pages, but just report
		 * the PFN even if not present.
		 */
		if (hmm_is_device_private_entry(range, entry)) {
			*pfn = hmm_device_entry_from_pfn(range,
					swp_offset(entry));
			*pfn |= range->flags[HMM_PFN_VALID];
			if (is_write_device_private_entry(entry))
				*pfn |= range->flags[HMM_PFN_WRITE];
			return 0;
		}

		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
		if (!required_fault)
			return 0;

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault)
		goto fault;

	/*
	 * Since each architecture defines a struct page for the zero page, just
	 * fall through and treat it like a normal page.
	 */
	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*pfn = range->values[HMM_PFN_SPECIAL];
		return 0;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point it is either a
	 * valid pmd entry pointing to a pte directory or a bad pmd that will
	 * not recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		uint64_t *pfns, cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif
#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	unsigned int required_fault;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
	if (required_fault) {
		spin_unlock(ptl);
		return hmm_vma_fault(addr, end, required_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->pfns +
					 ((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	hmm_vma_walk->last = end;

	/* Skip this vma and continue processing the next vma. */
	return 1;
}
static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: argument structure
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e., it has no backing VMA or it is illegal to access it.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e., without causing faults).
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * See the sketch of the expected caller pattern after the function body
 * below.
 */
long hmm_range_fault(struct hmm_range *range)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
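/*
 * Expected caller pattern, as a minimal sketch rather than a definitive
 * recipe: driver_lock and the pfns/flags setup are the caller's own, while
 * the mmu_interval_* helpers come from include/linux/mmu_notifier.h. The
 * sequence number read before the walk and rechecked under the driver lock
 * is what makes a concurrent invalidation force a retry:
 *
 *	long ret;
 *
 *again:
 *	range.notifier_seq = mmu_interval_read_begin(range.notifier);
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0) {
 *		if (ret == -EBUSY)
 *			goto again;
 *		return ret;
 *	}
 *
 *	mutex_lock(&driver_lock);
 *	if (mmu_interval_read_retry(range.notifier, range.notifier_seq)) {
 *		mutex_unlock(&driver_lock);
 *		goto again;
 *	}
 *	// range.pfns[] is now stable under driver_lock; program the
 *	// device page tables from it before dropping the lock.
 *	mutex_unlock(&driver_lock);
 */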