hmm.c: 05fc1df95e5dc09802813bab9c1e718f1e419d93 → 7d082987e5e562c07a208503a607a733d50553ba

The two revisions are identical apart from one line in hmm_vma_walk_pmd() (the pmd migration-entry path). The shared source is listed once below; the changed line is marked with "-" (old, 05fc1df95e5d) and "+" (new, 7d082987e5e5).
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	if (!vma)
		goto err;

	if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
		flags |= FAULT_FLAG_ALLOW_RETRY;
	if (write_fault)
		flags |= FAULT_FLAG_WRITE;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY) {
		/* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
		return -EAGAIN;
	}
	if (ret & VM_FAULT_ERROR)
		goto err;

	return -EBUSY;

err:
	*pfn = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
		struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;

	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return;

	/*
	 * So we not only consider the individual per page request we also
	 * consider the default flags requested for the range. The API can
	 * be used 2 ways. The first one where the HMM user coalesces
	 * multiple page faults into one request and sets flags per pfn for
	 * those faults. The second one where the HMM user wants to pre-
	 * fault a range with specific flags. For the latter one it is a
	 * waste to have the user pre-fill the pfn arrays with a default
	 * flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't ask to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry, ignore migration, use
		 * device and report anything else as error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(walk->mm, pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
				hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap)) {
			pte_unmap(ptep);
			return -EBUSY;
		}
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		if (!is_zero_pfn(pte_pfn(pte))) {
			pte_unmap(ptep);
			*pfn = range->values[HMM_PFN_SPECIAL];
			return -EFAULT;
		}
		/*
		 * Since each architecture defines a struct page for the zero
		 * page, just fall through and treat it like a normal page.
		 */
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
-		return 0;
+		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	} else if (!pmd_present(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read pmd value and check again it's a transparent
		 * huge or device mapping one and compute corresponding pfn
		 * values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above ie either none, migration,
	 * huge or transparent huge. At this point either it is a valid pmd
	 * entry pointing to pte directory or it is a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage get_dev_pagemap() optimization which
		 * will not re-take a reference on a pgmap if we already have
		 * one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole_(addr, end, fault, write_fault,
						  walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap)) {
				ret = -EBUSY;
				goto out_unlock;
			}
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	/*
	 * Skip vma ranges that don't have struct page backing them or
	 * map I/O devices directly.
	 */
	if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
		return -EFAULT;

	/*
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures
	 * that allow write without read.
	 */
	if (!(vma->vm_flags & VM_READ)) {
		bool fault, write_fault;

		/*
		 * Check to see if a fault is requested for any page in the
		 * range.
		 */
		hmm_range_need_fault(hmm_vma_walk, range->pfns +
					((start - range->start) >> PAGE_SHIFT),
					(end - start) >> PAGE_SHIFT,
					0, &fault, &write_fault);
		if (fault || write_fault)
			return -EFAULT;

		hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
		hmm_vma_walk->last = end;

		/* Skip this vma and continue processing the next vma. */
		return 1;
	}

	return 0;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range:	range being faulted
 * @flags:	HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to access
 *		that range) number of valid pages in range->pfns[] (from
 *		range start address).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.flags = flags,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
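The kerneldoc for hmm_range_fault() describes the calling convention (mmap_sem held, retry on -EBUSY, results published only while the mmu_interval_notifier sequence is still current), but the file contains no caller. The sketch below shows how a driver might drive this revision's API. It is illustrative only: my_hmm_flags, my_hmm_values, my_snapshot_range() and the driver_lock parameter are hypothetical names, and real users (GPU drivers, the hmm test module) have their own flag encodings and locking schemes.

```c
#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>

/* Hypothetical encodings for the generic flag/value slots consumed by hmm.c. */
static const uint64_t my_hmm_flags[HMM_PFN_FLAG_MAX] = {
	[HMM_PFN_VALID]		 = 1UL << 0,
	[HMM_PFN_WRITE]		 = 1UL << 1,
	[HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
};

static const uint64_t my_hmm_values[HMM_PFN_VALUE_MAX] = {
	[HMM_PFN_ERROR]		= 0xfffffffffffffffeUL,
	[HMM_PFN_NONE]		= 0,
	[HMM_PFN_SPECIAL]	= 0xfffffffffffffffcUL,
};

/*
 * Snapshot the CPU page tables for [start, end) into pfns[].  Assumes
 * @notifier was registered with mmu_interval_notifier_insert() and that
 * @driver_lock is the lock also taken by the notifier's invalidate()
 * callback, so a snapshot is only published while it is still valid.
 */
static long my_snapshot_range(struct mmu_interval_notifier *notifier,
			      struct mutex *driver_lock,
			      unsigned long start, unsigned long end,
			      uint64_t *pfns)
{
	struct mm_struct *mm = notifier->mm;
	struct hmm_range range = {
		.notifier	= notifier,
		.start		= start,
		.end		= end,
		.pfns		= pfns,
		.flags		= my_hmm_flags,
		.values		= my_hmm_values,
		.pfn_shift	= 3,	/* pfn sits above the three flag bits */
	};
	long ret;

	if (!mmget_not_zero(mm))
		return -EFAULT;

	while (true) {
		range.notifier_seq = mmu_interval_read_begin(notifier);

		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(&range, HMM_FAULT_SNAPSHOT);
		up_read(&mm->mmap_sem);
		if (ret == -EBUSY)
			continue;	/* collided with an invalidation */
		if (ret < 0)
			goto out;

		/* Publish only if the range was not invalidated meanwhile. */
		mutex_lock(driver_lock);
		if (!mmu_interval_read_retry(notifier, range.notifier_seq))
			break;
		mutex_unlock(driver_lock);
	}

	/* ... consume range.pfns[] (e.g. program the device) under the lock ... */
	mutex_unlock(driver_lock);
out:
	mmput(mm);
	return ret;
}
```

To pre-fault rather than snapshot, a caller would presumably drop HMM_FAULT_SNAPSHOT and set range.default_flags (and pfn_flags_mask) instead of filling the pfns array per page, which is the second usage mode described in the comment inside hmm_pte_need_fault().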