// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
		struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 bool fault, bool write_fault,
			 struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!fault && !write_fault);
	hmm_vma_walk->last = addr;

	if (!vma)
		goto out_error;

	if (write_fault) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE, i++)
		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
			goto out_error;

	return -EBUSY;

out_error:
	pfns[i] = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the range. The API can be used two
	 * ways: either the HMM user coalesces multiple page faults into one
	 * request and sets flags per pfn for those faults, or the HMM user
	 * wants to pre-fault a range with specific flags. For the latter it
	 * would be a waste to have the user pre-fill the pfn array with a
	 * default flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}
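
/*
 * A minimal sketch (not part of this file, kept under #if 0) of the two ways
 * a caller can drive the default_flags/pfn_flags_mask logic above. It assumes
 * a "range" whose flags[] and values[] tables are already set up; the page
 * index "i" is hypothetical.
 */
#if 0
	/* Pre-fault the whole range read/write without pre-filling pfns: */
	range->default_flags = range->flags[HMM_PFN_VALID] |
			       range->flags[HMM_PFN_WRITE];
	range->pfn_flags_mask = 0;

	/* Or: keep per-pfn control, requesting a write fault only on page i: */
	range->default_flags = 0;
	range->pfn_flags_mask = -1ULL;
	range->pfns[i] = range->flags[HMM_PFN_VALID] |
			 range->flags[HMM_PFN_WRITE];
#endif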

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	if (fault || write_fault)
		return hmm_vma_fault(addr, end, fault, write_fault, walk);
	hmm_vma_walk->last = addr;
	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (fault || write_fault)
		return hmm_vma_fault(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}
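
/*
 * A minimal sketch (not part of this file, kept under #if 0) of the per-driver
 * tables that pmd_to_hmm_pfn_flags()/pte_to_hmm_pfn_flags() index into. The
 * bit assignments below are made up for illustration; each HMM user picks
 * encodings that match its own device page table format.
 */
#if 0
static const uint64_t example_hmm_flags[HMM_PFN_FLAG_MAX] = {
	[HMM_PFN_VALID]		 = 1ULL << 0,
	[HMM_PFN_WRITE]		 = 1ULL << 1,
	[HMM_PFN_DEVICE_PRIVATE] = 1ULL << 2,
};

static const uint64_t example_hmm_values[HMM_PFN_VALUE_MAX] = {
	[HMM_PFN_ERROR]		 = 1ULL << 61,
	[HMM_PFN_NONE]		 = 0,
	[HMM_PFN_SPECIAL]	 = 1ULL << 62,
};
#endif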

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * This is a special swap entry: ignore migration entries, use
		 * device private entries, and report anything else as an
		 * error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
				   &write_fault);
		if (!fault && !write_fault)
			return 0;

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, &fault,
			   &write_fault);
	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap)) {
			pte_unmap(ptep);
			return -EBUSY;
		}
	}

	/*
	 * Since each architecture defines a struct page for the zero page,
	 * just fall through and treat it like a normal page.
	 */
	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
				   &write_fault);
		if (fault || write_fault) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*pfn = range->values[HMM_PFN_SPECIAL];
		return 0;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	bool fault, write_fault;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
		if (fault || write_fault)
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here; even if some other thread
		 * is splitting the huge pmd, we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
		if (fault || write_fault)
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, fault, write_fault,
					     walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap)) {
				ret = -EBUSY;
				goto out_unlock;
			}
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		spin_unlock(ptl);
		return hmm_vma_fault(addr, end, fault, write_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	/*
	 * Skip vma ranges that don't have struct page backing them or map I/O
	 * devices directly.
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 */
	if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) ||
	    !(vma->vm_flags & VM_READ)) {
		bool fault, write_fault;

		/*
		 * Check to see if a fault is requested for any page in the
		 * range.
		 */
		hmm_range_need_fault(hmm_vma_walk, range->pfns +
					((start - range->start) >> PAGE_SHIFT),
				     (end - start) >> PAGE_SHIFT,
				     0, &fault, &write_fault);
		if (fault || write_fault)
			return -EFAULT;

		hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
		hmm_vma_walk->last = end;

		/* Skip this vma and continue processing the next vma. */
		return 1;
	}

	return 0;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range:	range being faulted
 * @flags:	HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	Either no valid vma covers part of the range or it is illegal
 *		to access an address in the range.
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.flags = flags,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
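
/*
 * A minimal sketch (not part of this file, kept under #if 0) of the retry
 * protocol a caller is expected to wrap around hmm_range_fault(), following
 * Documentation/vm/hmm.rst. driver_lock and driver_update_page_tables() are
 * hypothetical stand-ins for the driver's own locking and device page table
 * update; the caller is assumed to have registered range->notifier with
 * mmu_interval_notifier_insert().
 */
#if 0
static int driver_populate_range(struct hmm_range *range)
{
	struct mm_struct *mm = range->notifier->mm;
	long ret;

again:
	range->notifier_seq = mmu_interval_read_begin(range->notifier);
	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, 0);
	up_read(&mm->mmap_sem);
	if (ret < 0) {
		if (ret == -EBUSY)
			goto again;
		return ret;
	}

	mutex_lock(&driver_lock);
	/* The snapshot is stale if the notifier sequence has moved on. */
	if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
		mutex_unlock(&driver_lock);
		goto again;
	}

	/* range->pfns[] stays valid while driver_lock is held. */
	driver_update_page_tables(range);
	mutex_unlock(&driver_lock);
	return 0;
}
#endif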