// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	if (!vma)
		goto err;

	if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
		flags |= FAULT_FLAG_ALLOW_RETRY;
	if (write_fault)
		flags |= FAULT_FLAG_WRITE;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY) {
		/* Note, handle_mm_fault() did up_read(&mm->mmap_sem) */
		return -EAGAIN;
	}
	if (ret & VM_FAULT_ERROR)
		goto err;

	return -EBUSY;

err:
	*pfn = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}
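/*
 * Example of the caller-provided encoding tables used throughout this file
 * (a hypothetical driver-side setup; the bit positions are illustrative and
 * driver-chosen, see include/linux/hmm.h for the real definitions):
 *
 *	static const uint64_t example_flags[HMM_PFN_FLAG_MAX] = {
 *		[HMM_PFN_VALID]          = 1UL << 0,
 *		[HMM_PFN_WRITE]          = 1UL << 1,
 *		[HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
 *	};
 *	static const uint64_t example_values[HMM_PFN_VALUE_MAX] = {
 *		[HMM_PFN_ERROR]          = 0x3UL << 62,
 *		[HMM_PFN_NONE]           = 0,
 *		[HMM_PFN_SPECIAL]        = 0x1UL << 62,
 *	};
 *
 * hmm_vma_do_fault() above writes values[HMM_PFN_ERROR] into the output
 * array on failure, which is why the values must be distinguishable from
 * any combination of the flags.
 */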
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
			 struct hmm_range *range,
			 enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;

	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}
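/*
 * Index arithmetic sketch for the helpers above (illustrative numbers):
 * with 4KiB pages, range->start = 0x100000 and a hole at addr = 0x103000
 * gives i = (0x103000 - 0x100000) >> PAGE_SHIFT = 3, so the walk fills
 * range->pfns[3] with range->values[HMM_PFN_NONE] before deciding whether
 * a fault is needed.
 */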
The second one where the HMM user wants to pre- 138023a019aSJérôme Glisse * fault a range with specific flags. For the latter one it is a 139023a019aSJérôme Glisse * waste to have the user pre-fill the pfn arrays with a default 140023a019aSJérôme Glisse * flags value. 141023a019aSJérôme Glisse */ 142023a019aSJérôme Glisse pfns = (pfns & range->pfn_flags_mask) | range->default_flags; 143023a019aSJérôme Glisse 1442aee09d8SJérôme Glisse /* We aren't ask to do anything ... */ 145f88a1e90SJérôme Glisse if (!(pfns & range->flags[HMM_PFN_VALID])) 1462aee09d8SJérôme Glisse return; 147d2e8d551SRalph Campbell /* If this is device memory then only fault if explicitly requested */ 148f88a1e90SJérôme Glisse if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 149f88a1e90SJérôme Glisse /* Do we fault on device memory ? */ 150f88a1e90SJérôme Glisse if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 151f88a1e90SJérôme Glisse *write_fault = pfns & range->flags[HMM_PFN_WRITE]; 152f88a1e90SJérôme Glisse *fault = true; 153f88a1e90SJérôme Glisse } 1542aee09d8SJérôme Glisse return; 1552aee09d8SJérôme Glisse } 156f88a1e90SJérôme Glisse 157f88a1e90SJérôme Glisse /* If CPU page table is not valid then we need to fault */ 158f88a1e90SJérôme Glisse *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 159f88a1e90SJérôme Glisse /* Need to write fault ? */ 160f88a1e90SJérôme Glisse if ((pfns & range->flags[HMM_PFN_WRITE]) && 161f88a1e90SJérôme Glisse !(cpu_flags & range->flags[HMM_PFN_WRITE])) { 162f88a1e90SJérôme Glisse *write_fault = true; 1632aee09d8SJérôme Glisse *fault = true; 1642aee09d8SJérôme Glisse } 1652aee09d8SJérôme Glisse } 1662aee09d8SJérôme Glisse 1672aee09d8SJérôme Glisse static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 1682aee09d8SJérôme Glisse const uint64_t *pfns, unsigned long npages, 1692aee09d8SJérôme Glisse uint64_t cpu_flags, bool *fault, 1702aee09d8SJérôme Glisse bool *write_fault) 1712aee09d8SJérôme Glisse { 1722aee09d8SJérôme Glisse unsigned long i; 1732aee09d8SJérôme Glisse 174d45d464bSChristoph Hellwig if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { 1752aee09d8SJérôme Glisse *fault = *write_fault = false; 1762aee09d8SJérôme Glisse return; 1772aee09d8SJérôme Glisse } 1782aee09d8SJérôme Glisse 179a3e0d41cSJérôme Glisse *fault = *write_fault = false; 1802aee09d8SJérôme Glisse for (i = 0; i < npages; ++i) { 1812aee09d8SJérôme Glisse hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 1822aee09d8SJérôme Glisse fault, write_fault); 183a3e0d41cSJérôme Glisse if ((*write_fault)) 1842aee09d8SJérôme Glisse return; 1852aee09d8SJérôme Glisse } 1862aee09d8SJérôme Glisse } 1872aee09d8SJérôme Glisse 1882aee09d8SJérôme Glisse static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 1892aee09d8SJérôme Glisse struct mm_walk *walk) 1902aee09d8SJérôme Glisse { 1912aee09d8SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 1922aee09d8SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 1932aee09d8SJérôme Glisse bool fault, write_fault; 1942aee09d8SJérôme Glisse unsigned long i, npages; 1952aee09d8SJérôme Glisse uint64_t *pfns; 1962aee09d8SJérôme Glisse 1972aee09d8SJérôme Glisse i = (addr - range->start) >> PAGE_SHIFT; 1982aee09d8SJérôme Glisse npages = (end - addr) >> PAGE_SHIFT; 1992aee09d8SJérôme Glisse pfns = &range->pfns[i]; 2002aee09d8SJérôme Glisse hmm_range_need_fault(hmm_vma_walk, pfns, npages, 2012aee09d8SJérôme Glisse 0, &fault, &write_fault); 2022aee09d8SJérôme Glisse return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 
static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
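/*
 * Worked example for the huge page math above (illustrative addresses):
 * for a 2MiB THP mapping, addr = 0x7f0000201000 and 4KiB pages,
 * addr & ~PMD_MASK = 0x1000, so the loop starts at pmd_pfn(pmd) + 1 and
 * emits one entry per 4KiB page until end. A single huge page therefore
 * expands into up to 512 consecutive entries of the output array.
 */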
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: ignore migration entries
		 * (unless a fault is required), use device private entries,
		 * and report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
							 swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(walk->mm, pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		if (!is_zero_pfn(pte_pfn(pte))) {
			*pfn = range->values[HMM_PFN_SPECIAL];
			return -EFAULT;
		}
		/*
		 * Since each architecture defines a struct page for the zero
		 * page, just fall through and treat it like a normal page.
		 */
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd_lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or a device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point it is either a
	 * valid pmd entry pointing to a pte directory, or a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap the pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}
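/*
 * Sketch of the get_dev_pagemap() caching contract relied on above (based
 * on its documented behavior): passing the previously returned pgmap back
 * in avoids reference count traffic while the walk stays within a single
 * pagemap.
 *
 *	pgmap = get_dev_pagemap(pfn, NULL);	   takes a reference
 *	pgmap = get_dev_pagemap(pfn + 1, pgmap);   same pgmap, no new reference
 *	put_dev_pagemap(pgmap);			   one put balances the above
 */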
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start, next;
	pmd_t *pmdp;
	pud_t pud;
	int ret;

again:
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return hmm_vma_walk_hole(start, end, walk);

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud))
			return hmm_vma_walk_hole(start, end, walk);

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault)
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		return 0;
	}

	split_huge_pud(walk->vma, pudp, addr);
	if (pud_none(*pudp))
		goto again;

	pmdp = pmd_offset(pudp, addr);
	do {
		next = pmd_addr_end(addr, end);
		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
		if (ret)
			return ret;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	/*
	 * Skip vma ranges that don't have struct page backing them or
	 * map I/O devices directly.
	 */
	if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
		return -EFAULT;

	/*
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures
	 * that allow write without read.
	 */
	if (!(vma->vm_flags & VM_READ)) {
		bool fault, write_fault;

		/*
		 * Check to see if a fault is requested for any page in the
		 * range.
		 */
		hmm_range_need_fault(hmm_vma_walk, range->pfns +
					((start - range->start) >> PAGE_SHIFT),
				     (end - start) >> PAGE_SHIFT,
				     0, &fault, &write_fault);
		if (fault || write_fault)
			return -EFAULT;

		hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
		hmm_vma_walk->last = end;

		/* Skip this vma and continue processing the next vma. */
		return 1;
	}

	return 0;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};
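/*
 * A note on the pagewalk contract assumed by the ops above: .test_walk is
 * called once per vma before any of the entry callbacks. A positive return
 * value (as hmm_vma_walk_test() uses for read-protected vmas) makes
 * walk_page_range() skip the vma and continue, a negative value aborts the
 * walk, and 0 proceeds into the pud/pmd/pte callbacks.
 */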
/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	Invalid range (i.e., either there is no valid vma or it is
 *		illegal to access the given range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.flags = flags,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
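/*
 * Caller-side sketch of the expected retry loop (hypothetical driver code;
 * "driver->update" is a placeholder lock, loosely following
 * Documentation/vm/hmm.rst):
 *
 *	again:
 *		range.notifier_seq = mmu_interval_read_begin(range.notifier);
 *		down_read(&mm->mmap_sem);
 *		ret = hmm_range_fault(&range, 0);
 *		up_read(&mm->mmap_sem);
 *		if (ret < 0) {
 *			if (ret == -EBUSY)
 *				goto again;
 *			return ret;
 *		}
 *
 *		mutex_lock(&driver->update);
 *		if (mmu_interval_read_retry(range.notifier,
 *					    range.notifier_seq)) {
 *			mutex_unlock(&driver->update);
 *			goto again;
 *		}
 *		... program device page tables from range.pfns[] ...
 *		mutex_unlock(&driver->update);
 */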
/**
 * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
 * @range: range being faulted
 * @device: device to map page to
 * @daddrs: array of dma addresses for the mapped pages
 * @flags: HMM_FAULT_*
 *
 * Return: the number of pages mapped on success (including zero), or any
 * status return from hmm_range_fault() otherwise.
 */
long hmm_range_dma_map(struct hmm_range *range, struct device *device,
		dma_addr_t *daddrs, unsigned int flags)
{
	unsigned long i, npages, mapped;
	long ret;

	ret = hmm_range_fault(range, flags);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0, mapped = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		/*
		 * FIXME: the DMA API should provide an invalid DMA address
		 * value instead of a function to test the dma address value.
		 * That would remove a lot of duplicated boilerplate across
		 * many architectures.
		 *
		 * For now setting it to 0 here is good enough, as the pfns[]
		 * value is what is used to check what is valid and what isn't.
		 */
		daddrs[i] = 0;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* Check if range is being invalidated */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq)) {
			ret = -EBUSY;
			goto unmap;
		}

		/* If it is both readable and writable, map it bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(device, daddrs[i])) {
			ret = -EFAULT;
			goto unmap;
		}

		mapped++;
	}

	return mapped;

unmap:
	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		if (dma_mapping_error(device, daddrs[i]))
			continue;

		/* If it is both readable and writable, it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		mapped--;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);
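/*
 * Pairing sketch (hypothetical driver use, same assumptions as the
 * hmm_range_fault() example above; error handling elided):
 *
 *	mapped = hmm_range_dma_map(&range, dev, daddrs, 0);
 *	if (mapped < 0)
 *		return mapped;	(includes -EBUSY, retry as shown above)
 *	... program the device with daddrs[] ...
 *	hmm_range_dma_unmap(&range, dev, daddrs, true);
 */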
/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: mark pages dirty if they had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by the mmu notifier (or use an HMM mirror
 * and abide by the sync_cpu_device_pagetables() callback) so that it is safe
 * here to call set_page_dirty(). The caller must also take appropriate locks
 * to prevent a concurrent mmu notifier or sync_cpu_device_pagetables() from
 * making progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty)
{
	unsigned long i, npages;
	long cpages = 0;

	/* Sanity check. */
	if (range->end <= range->start)
		return -EINVAL;
	if (!daddrs)
		return -EINVAL;
	if (!range->pfns)
		return -EINVAL;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* If it was both readable and writable, it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
			dir = DMA_BIDIRECTIONAL;

			/*
			 * See comments in function description on why it is
			 * safe here to call set_page_dirty()
			 */
			if (dirty)
				set_page_dirty(page);
		}

		/* Unmap and clear pfns/dma address */
		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		range->pfns[i] = range->values[HMM_PFN_NONE];
		/* FIXME see comments in hmm_range_dma_map() */
		daddrs[i] = 0;
		cpages++;
	}

	return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);