Lines Matching +full:line +full:- +full:orders

1 // SPDX-License-Identifier: GPL-2.0-only
9 * demand-loading started 01.12.91 - seems it is high on the list of
10 * things wanted, and it should be easy to implement. - Linus
14 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
15 * pages started 02.12.91, seems to work. - Linus.
21 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
27 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
29 * 20.12.91 - Ok, making the swap-device changeable like the root.
33 * 05.04.94 - Multi-page memory management added for v1.1.
36 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
68 #include <linux/memory-tiers.h>
88 #include "pgalloc-track.h"
93 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
101 * Return true if the original pte was a uffd-wp pte marker (so the pte was
102 * wr-protected).
106 if (!userfaultfd_wp(vmf->vma)) in vmf_orig_pte_uffd_wp()
108 if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) in vmf_orig_pte_uffd_wp()
111 return pte_marker_uffd_wp(vmf->orig_pte); in vmf_orig_pte_uffd_wp()
194 mm_dec_nr_ptes(tlb->mm); in free_pte_range()
222 if (end - 1 > ceiling - 1) in free_pmd_range()
228 mm_dec_nr_pmds(tlb->mm); in free_pmd_range()
256 if (end - 1 > ceiling - 1) in free_pud_range()
262 mm_dec_nr_puds(tlb->mm); in free_pud_range()
290 if (end - 1 > ceiling - 1) in free_p4d_range()
299 * free_pgd_range - Unmap and free page tables in the range
306 * This function tears down all user-level page tables in the
324 * Why all these "- 1"s? Because 0 represents both the bottom in free_pgd_range()
325 * of the address space and the top of it (using -1 for the in free_pgd_range()
329 * Comparisons need to use "end - 1" and "ceiling - 1" (though in free_pgd_range()
340 * bother to round floor or end up - the tests don't need that. in free_pgd_range()
354 if (end - 1 > ceiling - 1) in free_pgd_range()
355 end -= PMD_SIZE; in free_pgd_range()
356 if (addr > end - 1) in free_pgd_range()
363 pgd = pgd_offset(tlb->mm, addr); in free_pgd_range()
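
The "- 1" comparisons above are easy to misread, so here is a tiny, purely illustrative userspace C program (not kernel code, values made up) showing why subtracting 1 before comparing lets ceiling == 0 stand for the very top of the address space, as the free_pgd_range() comment explains.

#include <stdio.h>

int main(void)
{
	unsigned long ceiling = 0;	/* 0 here means "top of address space" */
	unsigned long end = 0x7000UL;	/* some ordinary end address */

	/* Naive comparison: end wrongly looks like it lies above the ceiling. */
	printf("end > ceiling          -> %d\n", end > ceiling);

	/* The trick used above: ceiling - 1 wraps to ULONG_MAX, so no real
	 * end address ever compares as being above a ceiling of 0. */
	printf("end - 1 > ceiling - 1  -> %d\n", end - 1 > ceiling - 1);
	return 0;
}
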
381 unsigned long addr = vma->vm_start; in free_pgtables()
388 next = mas_find(mas, ceiling - 1); in free_pgtables()
406 while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { in free_pgtables()
408 next = mas_find(mas, ceiling - 1); in free_pgtables()
418 free_pgd_range(tlb, addr, vma->vm_end, in free_pgtables()
419 floor, next ? next->vm_start : ceiling); in free_pgtables()
438 * of a chain of data-dependent loads, meaning most CPUs (alpha in pmd_install()
440 * seen in-order. See the alpha page table accessors for the in pmd_install()
454 return -ENOMEM; in __pte_alloc()
466 return -ENOMEM; in __pte_alloc_kernel()
576 * PFN-mapped pte in a region that doesn't allow it.
581 * re-walk the page table to dump information: the caller MUST prevent page
595 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; in print_bad_page_map()
598 pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, in print_bad_page_map()
600 __print_bad_page_map_pgtable(vma->vm_mm, addr); in print_bad_page_map()
604 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); in print_bad_page_map()
606 vma->vm_file, in print_bad_page_map()
607 vma->vm_ops ? vma->vm_ops->fault : NULL, in print_bad_page_map()
608 vma->vm_file ? vma->vm_file->f_op->mmap : NULL, in print_bad_page_map()
609 vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, in print_bad_page_map()
610 mapping ? mapping->a_ops->read_folio : NULL); in print_bad_page_map()
618 * __vm_normal_page() - Get the "struct page" associated with a page table entry.
645 * instead will be looked up through vm_ops->find_normal_page(). So far, this
657 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
689 if (vma->vm_ops && vma->vm_ops->find_normal_page) in __vm_normal_page()
690 return vma->vm_ops->find_normal_page(vma, addr); in __vm_normal_page()
692 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) in __vm_normal_page()
705 if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { in __vm_normal_page()
706 if (vma->vm_flags & VM_MIXEDMAP) { in __vm_normal_page()
711 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; in __vm_normal_page()
714 if (pfn == vma->vm_pgoff + off) in __vm_normal_page()
716 if (!is_cow_mapping(vma->vm_flags)) in __vm_normal_page()
739 * vm_normal_page() - Get the "struct page" associated with a PTE
758 * vm_normal_folio() - Get the "struct folio" associated with a PTE
781 * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
800 * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
822 * vm_normal_page_pud() - Get the "struct page" associated with a PUD
842 * restore_exclusive_pte - Restore a device-exclusive entry
850 * Restore a device-exclusive non-swap entry to an ordinary present pte.
856 * a device-exclusive entry can map it into the device to make forward
875 pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); in restore_exclusive_pte()
882 if ((vma->vm_flags & VM_WRITE) && in restore_exclusive_pte()
888 set_pte_at(vma->vm_mm, address, ptep, pte); in restore_exclusive_pte()
891 * No need to invalidate - it was non-present before. However in restore_exclusive_pte()
913 return -EBUSY; in try_restore_exclusive_pte()
927 vm_flags_t vm_flags = dst_vma->vm_flags; in copy_nonpresent_pte()
936 return -EIO; in copy_nonpresent_pte()
939 if (unlikely(list_empty(&dst_mm->mmlist))) { in copy_nonpresent_pte()
941 if (list_empty(&dst_mm->mmlist)) in copy_nonpresent_pte()
942 list_add(&dst_mm->mmlist, in copy_nonpresent_pte()
943 &src_mm->mmlist); in copy_nonpresent_pte()
992 * We do not preserve soft-dirty information, because so in copy_nonpresent_pte()
1014 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); in copy_nonpresent_pte()
1016 return -EBUSY; in copy_nonpresent_pte()
1017 return -ENOENT; in copy_nonpresent_pte()
1037 * and re-use the pte the traditional way.
1039 * And if we need a pre-allocated page but don't yet have
1054 return -EAGAIN; in copy_present_page()
1061 if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) in copy_present_page()
1062 return -EHWPOISON; in copy_present_page()
1071 pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); in copy_present_page()
1074 /* Uffd-wp needs to be delivered to dest pte as well */ in copy_present_page()
1076 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); in copy_present_page()
1084 struct mm_struct *src_mm = src_vma->vm_mm; in __copy_present_ptes()
1087 if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { in __copy_present_ptes()
1093 if (src_vma->vm_flags & VM_SHARED) in __copy_present_ptes()
1100 set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); in __copy_present_ptes()
1104 * Copy one present PTE, trying to batch-process subsequent PTEs that map
1107 * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
1132 if (!(src_vma->vm_flags & VM_SHARED)) in copy_present_ptes()
1143 return -EAGAIN; in copy_present_ptes()
1210 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pte_range()
1211 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pte_range()
1231 * protected by mmap_lock-less collapse skipping areas with anon_vma in copy_pte_range()
1237 ret = -ENOMEM; in copy_pte_range()
1243 * retract_page_tables() are using vma->anon_vma to be exclusive, so in copy_pte_range()
1263 * We are holding two locks at this point - either of them in copy_pte_range()
1282 if (ret == -EIO) { in copy_pte_range()
1285 } else if (ret == -EBUSY) { in copy_pte_range()
1298 WARN_ON_ONCE(ret != -ENOENT); in copy_pte_range()
1301 max_nr = (end - addr) / PAGE_SIZE; in copy_pte_range()
1305 * If we need a pre-allocated page for this pte, drop the in copy_pte_range()
1309 if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) in copy_pte_range()
1313 * pre-alloc page cannot be reused by next time so as in copy_pte_range()
1332 if (ret == -EIO) { in copy_pte_range()
1335 ret = -ENOMEM; in copy_pte_range()
1339 } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { in copy_pte_range()
1341 } else if (ret == -EAGAIN) { in copy_pte_range()
1344 return -ENOMEM; in copy_pte_range()
1365 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pmd_range()
1366 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pmd_range()
1372 return -ENOMEM; in copy_pmd_range()
1378 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); in copy_pmd_range()
1381 if (err == -ENOMEM) in copy_pmd_range()
1382 return -ENOMEM; in copy_pmd_range()
1391 return -ENOMEM; in copy_pmd_range()
1401 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pud_range()
1402 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pud_range()
1408 return -ENOMEM; in copy_pud_range()
1415 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); in copy_pud_range()
1418 if (err == -ENOMEM) in copy_pud_range()
1419 return -ENOMEM; in copy_pud_range()
1428 return -ENOMEM; in copy_pud_range()
1438 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_p4d_range()
1444 return -ENOMEM; in copy_p4d_range()
1452 return -ENOMEM; in copy_p4d_range()
1466 * Always copy pgtables when dst_vma has uffd-wp enabled even if it's in vma_needs_copy()
1467 * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable in vma_needs_copy()
1468 * contains uffd-wp protection information, that's something we can't in vma_needs_copy()
1474 if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) in vma_needs_copy()
1477 if (src_vma->anon_vma) in vma_needs_copy()
1493 unsigned long addr = src_vma->vm_start; in copy_page_range()
1494 unsigned long end = src_vma->vm_end; in copy_page_range()
1495 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_page_range()
1496 struct mm_struct *src_mm = src_vma->vm_mm; in copy_page_range()
1514 is_cow = is_cow_mapping(src_vma->vm_flags); in copy_page_range()
1528 raw_write_seqcount_begin(&src_mm->write_protect_seq); in copy_page_range()
1540 ret = -ENOMEM; in copy_page_range()
1546 raw_write_seqcount_end(&src_mm->write_protect_seq); in copy_page_range()
1556 if (!details || details->reclaim_pt) in should_zap_cows()
1560 return details->even_cows; in should_zap_cows()
1571 /* Otherwise we should only zap non-anon folios */ in should_zap_folio()
1580 return details->zap_flags & ZAP_FLAG_DROP_MARKER; in zap_drop_markers()
1584 * This function makes sure that we'll replace the none pte with an uffd-wp
1587 * Returns true if uffd-wp ptes was installed, false otherwise.
1608 if (--nr == 0) in zap_install_uffd_wp_if_needed()
1623 struct mm_struct *mm = tlb->mm; in zap_present_folio_ptes()
1627 ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); in zap_present_folio_ptes()
1637 rss[mm_counter(folio)] -= nr; in zap_present_folio_ptes()
1639 /* We don't need up-to-date accessed/dirty bits. */ in zap_present_folio_ptes()
1640 clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); in zap_present_folio_ptes()
1641 rss[MM_ANONPAGES] -= nr; in zap_present_folio_ptes()
1663 * Zap or skip at least one present PTE, trying to batch-process subsequent
1674 struct mm_struct *mm = tlb->mm; in zap_present_ptes()
1681 /* We don't need up-to-date accessed/dirty bits. */ in zap_present_ptes()
1682 ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); in zap_present_ptes()
1734 * consider uffd-wp bit when zap. For more information, in zap_nonpresent_ptes()
1738 rss[mm_counter(folio)]--; in zap_nonpresent_ptes()
1747 rss[MM_SWAPENTS] -= nr; in zap_nonpresent_ptes()
1754 rss[mm_counter(folio)]--; in zap_nonpresent_ptes()
1778 clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); in zap_nonpresent_ptes()
1792 int max_nr = (end - addr) / PAGE_SIZE; in do_zap_pte_range()
1802 max_nr -= nr; in do_zap_pte_range()
1826 struct mm_struct *mm = tlb->mm; in zap_pte_range()
1924 if (next - addr != HPAGE_PMD_SIZE) in zap_pmd_range()
1931 } else if (details && details->single_folio && in zap_pmd_range()
1932 folio_test_pmd_mappable(details->single_folio) && in zap_pmd_range()
1933 next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { in zap_pmd_range()
1934 spinlock_t *ptl = pmd_lock(tlb->mm, pmd); in zap_pmd_range()
1948 pmd--; in zap_pmd_range()
1966 if (next - addr != HPAGE_PUD_SIZE) { in zap_pud_range()
1967 mmap_assert_locked(tlb->mm); in zap_pud_range()
2012 pgd = pgd_offset(vma->vm_mm, addr); in unmap_page_range()
2028 unsigned long start = max(vma->vm_start, start_addr); in unmap_single_vma()
2031 if (start >= vma->vm_end) in unmap_single_vma()
2033 end = min(vma->vm_end, end_addr); in unmap_single_vma()
2034 if (end <= vma->vm_start) in unmap_single_vma()
2037 if (vma->vm_file) in unmap_single_vma()
2043 * It is undesirable to test vma->vm_file as it in unmap_single_vma()
2044 * should be non-null for valid hugetlb area. in unmap_single_vma()
2047 * hugetlbfs ->mmap method fails, in unmap_single_vma()
2048 * mmap_region() nullifies vma->vm_file in unmap_single_vma()
2053 if (vma->vm_file) { in unmap_single_vma()
2055 details->zap_flags : 0; in unmap_single_vma()
2065 * unmap_vmas - unmap a range of memory covered by a list of vma's
2082 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
2093 /* Careful - we need to zap private pages too! */ in unmap_vmas()
2097 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, in unmap_vmas()
2107 vma = mas_find(mas, tree_end - 1); in unmap_vmas()
2113 * zap_page_range_single_batched - remove user pages in a given range
2121 * hugetlb, @tlb is flushed and re-initialized by this function.
2130 VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); in zap_page_range_single_batched()
2132 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, in zap_page_range_single_batched()
2135 update_hiwater_rss(vma->vm_mm); in zap_page_range_single_batched()
2138 * unmap 'address-end' not 'range.start-range.end' as range in zap_page_range_single_batched()
2150 tlb_gather_mmu(tlb, vma->vm_mm); in zap_page_range_single_batched()
2155 * zap_page_range_single - remove user pages in a given range
2168 tlb_gather_mmu(&tlb, vma->vm_mm); in zap_page_range_single()
2174 * zap_vma_ptes - remove ptes mapping the vma
2188 !(vma->vm_flags & VM_PFNMAP)) in zap_vma_ptes()
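
A minimal sketch of how a driver might use zap_vma_ptes(), documented above, to tear down the user mapping of a buffer it is about to release. The my_dev_revoke_mapping() helper is hypothetical, and the VMA is assumed to be the VM_PFNMAP VMA the driver set up earlier in its ->mmap handler (the check shown above silently ignores anything else).

#include <linux/mm.h>

static void my_dev_revoke_mapping(struct vm_area_struct *vma)
{
	/* Remove every pte covering the VMA; later accesses will fault. */
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}
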
2229 VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); in vm_mixed_zeropage_allowed()
2236 if (mm_forbids_zeropage(vma->vm_mm)) in vm_mixed_zeropage_allowed()
2239 if (is_cow_mapping(vma->vm_flags)) in vm_mixed_zeropage_allowed()
2242 if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) in vm_mixed_zeropage_allowed()
2245 * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could in vm_mixed_zeropage_allowed()
2246 * find the shared zeropage and longterm-pin it, which would in vm_mixed_zeropage_allowed()
2248 * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would in vm_mixed_zeropage_allowed()
2253 return vma->vm_ops && vma->vm_ops->pfn_mkwrite && in vm_mixed_zeropage_allowed()
2254 (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); in vm_mixed_zeropage_allowed()
2263 return -EINVAL; in validate_page_before_insert()
2266 return -EINVAL; in validate_page_before_insert()
2270 return -EINVAL; in validate_page_before_insert()
2284 return -EBUSY; in insert_page_into_pte_locked()
2289 return -EFAULT; in insert_page_into_pte_locked()
2309 inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); in insert_page_into_pte_locked()
2312 set_pte_at(vma->vm_mm, addr, pte, pteval); in insert_page_into_pte_locked()
2326 retval = -ENOMEM; in insert_page()
2327 pte = get_locked_pte(vma->vm_mm, addr, &ptl); in insert_page()
2357 struct mm_struct *const mm = vma->vm_mm; in insert_pages()
2363 ret = -EFAULT; in insert_pages()
2369 remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); in insert_pages()
2372 ret = -ENOMEM; in insert_pages()
2382 ret = -EFAULT; in insert_pages()
2391 remaining_pages_total -= pte_idx; in insert_pages()
2398 pages_to_write_in_pmd -= batch_size; in insert_pages()
2399 remaining_pages_total -= batch_size; in insert_pages()
2410 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2427 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; in vm_insert_pages()
2429 if (addr < vma->vm_start || end_addr >= vma->vm_end) in vm_insert_pages()
2430 return -EFAULT; in vm_insert_pages()
2431 if (!(vma->vm_flags & VM_MIXEDMAP)) { in vm_insert_pages()
2432 BUG_ON(mmap_read_trylock(vma->vm_mm)); in vm_insert_pages()
2433 BUG_ON(vma->vm_flags & VM_PFNMAP); in vm_insert_pages()
2437 return insert_pages(vma, addr, pages, num, vma->vm_page_prot); in vm_insert_pages()
2442 * vm_insert_page - insert single page into user vma
2464 * Usually this function is called from f_op->mmap() handler
2465 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2467 * function from other places, for example from page-fault handler.
2474 if (addr < vma->vm_start || addr >= vma->vm_end) in vm_insert_page()
2475 return -EFAULT; in vm_insert_page()
2476 if (!(vma->vm_flags & VM_MIXEDMAP)) { in vm_insert_page()
2477 BUG_ON(mmap_read_trylock(vma->vm_mm)); in vm_insert_page()
2478 BUG_ON(vma->vm_flags & VM_PFNMAP); in vm_insert_page()
2481 return insert_page(vma, addr, page, vma->vm_page_prot, false); in vm_insert_page()
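
The comment above notes that vm_insert_page() is usually called from an f_op->mmap() handler while mmap_lock is held for write. A hedged sketch of such a handler follows; my_dev_page, my_dev_mmap() and my_dev_fops are illustrative names, and the page is assumed to have been allocated elsewhere (e.g. with alloc_page() at probe time).

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

static struct page *my_dev_page;	/* assumed: allocated at probe time */

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;

	/* Inserts the page and, for a non-VM_MIXEDMAP VMA, updates the
	 * VMA flags, which is why the write-locked mmap_lock matters. */
	return vm_insert_page(vma, vma->vm_start, my_dev_page);
}

static const struct file_operations my_dev_fops = {
	.owner = THIS_MODULE,
	.mmap  = my_dev_mmap,
};

For mapping many pages at once, vm_insert_pages() (documented above) does the same job while batching the work under a single PMD lock.
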
2486 * __vm_map_pages - maps range of kernel pages into user vma
2502 unsigned long uaddr = vma->vm_start; in __vm_map_pages()
2507 return -ENXIO; in __vm_map_pages()
2510 if (count > num - offset) in __vm_map_pages()
2511 return -ENXIO; in __vm_map_pages()
2524 * vm_map_pages - maps range of kernel pages starting with non-zero offset
2544 return __vm_map_pages(vma, pages, num, vma->vm_pgoff); in vm_map_pages()
2549 * vm_map_pages_zero - map range of kernel pages starting with zero offset
2571 struct mm_struct *mm = vma->vm_mm; in insert_pfn()
2620 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2627 * to override pgprot on a per-page basis.
2634 * pgprot typically only differs from @vma->vm_page_prot when drivers set
2635 * caching- and encryption bits different than those of @vma->vm_page_prot,
2636 * because the caching- or encryption mode may not be known at mmap() time.
2638 * This is ok as long as @vma->vm_page_prot is not used by the core vm
2641 * functions that don't touch caching- or encryption bits, using pte_modify()
2644 * Also when new page-table entries are created, this is only done using the
2645 * fault() callback, and never using the value of vma->vm_page_prot,
2646 * except for page-table entries that point to anonymous pages as the result
2661 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); in vmf_insert_pfn_prot()
2662 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == in vmf_insert_pfn_prot()
2664 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); in vmf_insert_pfn_prot()
2665 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); in vmf_insert_pfn_prot()
2667 if (addr < vma->vm_start || addr >= vma->vm_end) in vmf_insert_pfn_prot()
2680 * vmf_insert_pfn - insert single pfn into user vma
2688 * This function should only be called from a vm_ops->fault handler, and
2702 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); in vmf_insert_pfn()
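
As the kernel-doc above says, vmf_insert_pfn() is meant to be called from a vm_ops->fault handler. Below is a hedged sketch of such a handler; my_dev_base_pfn and my_pfnmap_vm_ops are hypothetical names, and the PFN arithmetic simply offsets a per-device base by the faulting page offset.

#include <linux/mm.h>

static unsigned long my_dev_base_pfn;	/* assumed: set up at probe time */

static vm_fault_t my_pfnmap_fault(struct vm_fault *vmf)
{
	unsigned long pfn = my_dev_base_pfn + vmf->pgoff;

	/* On success this returns VM_FAULT_NOPAGE, telling the core
	 * fault path that the pte has already been installed. */
	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}

static const struct vm_operations_struct my_pfnmap_vm_ops = {
	.fault = my_pfnmap_fault,
};
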
2713 if (vma->vm_flags & VM_MIXEDMAP) in vm_mixed_ok()
2723 pgprot_t pgprot = vma->vm_page_prot; in __vm_insert_mixed()
2729 if (addr < vma->vm_start || addr >= vma->vm_end) in __vm_insert_mixed()
2758 if (err == -ENOMEM) in __vm_insert_mixed()
2760 if (err < 0 && err != -EBUSY) in __vm_insert_mixed()
2769 pgprot_t pgprot = vmf->vma->vm_page_prot; in vmf_insert_page_mkwrite()
2770 unsigned long addr = vmf->address; in vmf_insert_page_mkwrite()
2773 if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) in vmf_insert_page_mkwrite()
2776 err = insert_page(vmf->vma, addr, page, pgprot, write); in vmf_insert_page_mkwrite()
2777 if (err == -ENOMEM) in vmf_insert_page_mkwrite()
2779 if (err < 0 && err != -EBUSY) in vmf_insert_page_mkwrite()
2807 * in null mappings (currently treated as "copy-on-access")
2819 return -ENOMEM; in remap_pte_range()
2824 err = -EACCES; in remap_pte_range()
2843 pfn -= addr >> PAGE_SHIFT; in remap_pmd_range()
2846 return -ENOMEM; in remap_pmd_range()
2866 pfn -= addr >> PAGE_SHIFT; in remap_pud_range()
2869 return -ENOMEM; in remap_pud_range()
2888 pfn -= addr >> PAGE_SHIFT; in remap_p4d_range()
2891 return -ENOMEM; in remap_p4d_range()
2908 struct mm_struct *mm = vma->vm_mm; in remap_pfn_range_internal()
2912 return -EINVAL; in remap_pfn_range_internal()
2927 * There's a horrible special case to handle copy-on-write in remap_pfn_range_internal()
2929 * un-COW'ed pages by matching them up with "vma->vm_pgoff". in remap_pfn_range_internal()
2932 if (is_cow_mapping(vma->vm_flags)) { in remap_pfn_range_internal()
2933 if (addr != vma->vm_start || end != vma->vm_end) in remap_pfn_range_internal()
2934 return -EINVAL; in remap_pfn_range_internal()
2935 vma->vm_pgoff = pfn; in remap_pfn_range_internal()
2941 pfn -= addr >> PAGE_SHIFT; in remap_pfn_range_internal()
2957 * must have pre-validated the caching bits of the pgprot_t.
2983 return ERR_PTR(-EINVAL); in pfnmap_track_ctx_alloc()
2988 return ERR_PTR(-ENOMEM); in pfnmap_track_ctx_alloc()
2991 ctx->pfn = pfn; in pfnmap_track_ctx_alloc()
2992 ctx->size = size; in pfnmap_track_ctx_alloc()
2993 kref_init(&ctx->kref); in pfnmap_track_ctx_alloc()
3001 pfnmap_untrack(ctx->pfn, ctx->size); in pfnmap_track_ctx_release()
3007 * remap_pfn_range - remap kernel memory to userspace
3036 if (addr == vma->vm_start && addr + size == vma->vm_end) { in remap_pfn_range()
3037 if (vma->pfnmap_track_ctx) in remap_pfn_range()
3038 return -EINVAL; in remap_pfn_range()
3043 return -EINVAL; in remap_pfn_range()
3049 kref_put(&ctx->kref, pfnmap_track_ctx_release); in remap_pfn_range()
3051 vma->pfnmap_track_ctx = ctx; in remap_pfn_range()
3066 * vm_iomap_memory - remap memory to userspace
3075 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
3076 * whatever write-combining details or similar.
3086 return -EINVAL; in vm_iomap_memory()
3088 * You *really* shouldn't map things that aren't page-aligned, in vm_iomap_memory()
3096 return -EINVAL; in vm_iomap_memory()
3099 if (vma->vm_pgoff > pages) in vm_iomap_memory()
3100 return -EINVAL; in vm_iomap_memory()
3101 pfn += vma->vm_pgoff; in vm_iomap_memory()
3102 pages -= vma->vm_pgoff; in vm_iomap_memory()
3105 vm_len = vma->vm_end - vma->vm_start; in vm_iomap_memory()
3107 return -EINVAL; in vm_iomap_memory()
3110 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); in vm_iomap_memory()
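
The note above that drivers may want to tweak vma->vm_page_prot first, together with the alignment and vm_pgoff checks in vm_iomap_memory(), suggests the following hedged ->mmap sketch. my_dev_phys, my_dev_len and my_iomem_mmap() are placeholder names for a device's MMIO window; a real driver would take them from its resource.

#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_dev_phys;		/* assumed: physical base of the window */
static unsigned long my_dev_len;	/* assumed: its length in bytes */

static int my_iomem_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Device memory: ask for an uncached mapping before remapping. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* vm_iomap_memory() checks alignment and vm_pgoff, then calls
	 * io_remap_pfn_range() over the whole VMA, as shown above. */
	return vm_iomap_memory(vma, my_dev_phys, my_dev_len);
}
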
3128 return -ENOMEM; in apply_to_pte_range()
3134 return -EINVAL; in apply_to_pte_range()
3171 return -ENOMEM; in apply_to_pmd_range()
3180 return -EINVAL; in apply_to_pmd_range()
3207 return -ENOMEM; in apply_to_pud_range()
3216 return -EINVAL; in apply_to_pud_range()
3243 return -ENOMEM; in apply_to_p4d_range()
3252 return -EINVAL; in apply_to_p4d_range()
3278 return -EINVAL; in __apply_to_page_range()
3286 err = -EINVAL; in __apply_to_page_range()
3332 * read non-atomically. Before making any commitment, on those architectures
3343 spin_lock(vmf->ptl); in pte_unmap_same()
3344 same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); in pte_unmap_same()
3345 spin_unlock(vmf->ptl); in pte_unmap_same()
3348 pte_unmap(vmf->pte); in pte_unmap_same()
3349 vmf->pte = NULL; in pte_unmap_same()
3356 * -EHWPOISON: copy failed due to hwpoison in source page
3357 * -EAGAIN: copy failed (some other reason)
3365 struct vm_area_struct *vma = vmf->vma; in __wp_page_copy_user()
3366 struct mm_struct *mm = vma->vm_mm; in __wp_page_copy_user()
3367 unsigned long addr = vmf->address; in __wp_page_copy_user()
3371 return -EHWPOISON; in __wp_page_copy_user()
3377 * a "struct page" for it. We do a best-effort copy by in __wp_page_copy_user()
3379 * fails, we just zero-fill it. Live with it. in __wp_page_copy_user()
3389 vmf->pte = NULL; in __wp_page_copy_user()
3390 if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { in __wp_page_copy_user()
3393 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); in __wp_page_copy_user()
3394 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in __wp_page_copy_user()
3399 if (vmf->pte) in __wp_page_copy_user()
3400 update_mmu_tlb(vma, addr, vmf->pte); in __wp_page_copy_user()
3401 ret = -EAGAIN; in __wp_page_copy_user()
3405 entry = pte_mkyoung(vmf->orig_pte); in __wp_page_copy_user()
3406 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) in __wp_page_copy_user()
3407 update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); in __wp_page_copy_user()
3417 if (vmf->pte) in __wp_page_copy_user()
3420 /* Re-validate under PTL if the page is still mapped */ in __wp_page_copy_user()
3421 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); in __wp_page_copy_user()
3422 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in __wp_page_copy_user()
3424 if (vmf->pte) in __wp_page_copy_user()
3425 update_mmu_tlb(vma, addr, vmf->pte); in __wp_page_copy_user()
3426 ret = -EAGAIN; in __wp_page_copy_user()
3437 * use-case in __wp_page_copy_user()
3448 if (vmf->pte) in __wp_page_copy_user()
3449 pte_unmap_unlock(vmf->pte, vmf->ptl); in __wp_page_copy_user()
3459 struct file *vm_file = vma->vm_file; in __get_fault_gfp_mask()
3462 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; in __get_fault_gfp_mask()
3480 unsigned int old_flags = vmf->flags; in do_page_mkwrite()
3482 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; in do_page_mkwrite()
3484 if (vmf->vma->vm_file && in do_page_mkwrite()
3485 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) in do_page_mkwrite()
3488 ret = vmf->vma->vm_ops->page_mkwrite(vmf); in do_page_mkwrite()
3490 vmf->flags = old_flags; in do_page_mkwrite()
3495 if (!folio->mapping) { in do_page_mkwrite()
3512 struct vm_area_struct *vma = vmf->vma; in fault_dirty_shared_page()
3514 struct folio *folio = page_folio(vmf->page); in fault_dirty_shared_page()
3516 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; in fault_dirty_shared_page()
3521 * Take a local copy of the address_space - folio.mapping may be zeroed in fault_dirty_shared_page()
3523 * pinned by vma->vm_file's reference. We rely on folio_unlock()'s in fault_dirty_shared_page()
3530 file_update_time(vma->vm_file); in fault_dirty_shared_page()
3561 * any related book-keeping.
3564 __releases(vmf->ptl) in wp_page_reuse()
3566 struct vm_area_struct *vma = vmf->vma; in wp_page_reuse()
3569 VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); in wp_page_reuse()
3570 VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); in wp_page_reuse()
3574 !PageAnonExclusive(vmf->page)); in wp_page_reuse()
3580 folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); in wp_page_reuse()
3583 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); in wp_page_reuse()
3584 entry = pte_mkyoung(vmf->orig_pte); in wp_page_reuse()
3586 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) in wp_page_reuse()
3587 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); in wp_page_reuse()
3588 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_reuse()
3594 * vm_ops that have a ->map_pages have been audited and don't need
3599 struct vm_area_struct *vma = vmf->vma; in vmf_can_call_fault()
3601 if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) in vmf_can_call_fault()
3608 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
3614 * only protected by the per-VMA lock, the caller must retry with the
3617 * do with only the per-VMA lock held for this VMA.
3624 struct vm_area_struct *vma = vmf->vma; in __vmf_anon_prepare()
3627 if (likely(vma->anon_vma)) in __vmf_anon_prepare()
3629 if (vmf->flags & FAULT_FLAG_VMA_LOCK) { in __vmf_anon_prepare()
3630 if (!mmap_read_trylock(vma->vm_mm)) in __vmf_anon_prepare()
3635 if (vmf->flags & FAULT_FLAG_VMA_LOCK) in __vmf_anon_prepare()
3636 mmap_read_unlock(vma->vm_mm); in __vmf_anon_prepare()
3649 * - Allocate a page, copy the content of the old page to the new one.
3650 * - Handle bookkeeping and accounting - cgroups, mmu-notifiers, etc. in wp_page_copy()
3651 * - Take the PTL. If the pte changed, bail out and release the allocated page
3652 * - If the pte is still the way we remember it, update the page table and all
3653 * relevant references. This includes dropping the reference the page-table
3655 * - In any case, unlock the PTL and drop the reference we took to the old page.
3659 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in wp_page_copy()
3660 struct vm_area_struct *vma = vmf->vma; in wp_page_copy()
3661 struct mm_struct *mm = vma->vm_mm; in wp_page_copy()
3672 if (vmf->page) in wp_page_copy()
3673 old_folio = page_folio(vmf->page); in wp_page_copy()
3678 pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); in wp_page_copy()
3679 new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); in wp_page_copy()
3686 err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); in wp_page_copy()
3690 * it's fine. If not, userspace would re-fault on in wp_page_copy()
3693 * The -EHWPOISON case will not be retried. in wp_page_copy()
3700 return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; in wp_page_copy()
3702 kmsan_copy_page_meta(&new_folio->page, vmf->page); in wp_page_copy()
3708 vmf->address & PAGE_MASK, in wp_page_copy()
3709 (vmf->address & PAGE_MASK) + PAGE_SIZE); in wp_page_copy()
3713 * Re-check the pte - we dropped the lock in wp_page_copy()
3715 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); in wp_page_copy()
3716 if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in wp_page_copy()
3723 ksm_might_unmap_zero_page(mm, vmf->orig_pte); in wp_page_copy()
3726 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); in wp_page_copy()
3727 entry = folio_mk_pte(new_folio, vma->vm_page_prot); in wp_page_copy()
3730 if (pte_soft_dirty(vmf->orig_pte)) in wp_page_copy()
3732 if (pte_uffd_wp(vmf->orig_pte)) in wp_page_copy()
3745 ptep_clear_flush(vma, vmf->address, vmf->pte); in wp_page_copy()
3746 folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); in wp_page_copy()
3749 set_pte_at(mm, vmf->address, vmf->pte, entry); in wp_page_copy()
3750 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); in wp_page_copy()
3774 folio_remove_rmap_pte(old_folio, vmf->page, vma); in wp_page_copy()
3780 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_copy()
3781 } else if (vmf->pte) { in wp_page_copy()
3782 update_mmu_tlb(vma, vmf->address, vmf->pte); in wp_page_copy()
3783 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_copy()
3809 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3813 * @folio: the folio of vmf->page
3816 * shared mapping due to PTE being read-only once the mapped page is prepared.
3827 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); in finish_mkwrite_fault()
3828 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, in finish_mkwrite_fault()
3829 &vmf->ptl); in finish_mkwrite_fault()
3830 if (!vmf->pte) in finish_mkwrite_fault()
3836 if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { in finish_mkwrite_fault()
3837 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); in finish_mkwrite_fault()
3838 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_mkwrite_fault()
3851 struct vm_area_struct *vma = vmf->vma; in wp_pfn_shared()
3853 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { in wp_pfn_shared()
3856 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_pfn_shared()
3861 vmf->flags |= FAULT_FLAG_MKWRITE; in wp_pfn_shared()
3862 ret = vma->vm_ops->pfn_mkwrite(vmf); in wp_pfn_shared()
3872 __releases(vmf->ptl) in wp_page_shared()
3874 struct vm_area_struct *vma = vmf->vma; in wp_page_shared()
3879 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { in wp_page_shared()
3882 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_shared()
3934 if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) in __wp_can_reuse_large_anon_folio()
3957 if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) in __wp_can_reuse_large_anon_folio()
3964 VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && in __wp_can_reuse_large_anon_folio()
3965 folio_mm_id(folio, 1) != vma->vm_mm->mm_id); in __wp_can_reuse_large_anon_folio()
4033 * shared-page counter for the old page.
4036 * done by the caller (the low-level page fault routine in most cases).
4044 * We enter with non-exclusive mmap_lock (to exclude vma changes,
4049 __releases(vmf->ptl) in do_wp_page()
4051 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in do_wp_page()
4052 struct vm_area_struct *vma = vmf->vma; in do_wp_page()
4057 if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { in do_wp_page()
4059 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
4065 * etc.) because we're only removing the uffd-wp bit, in do_wp_page()
4068 pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); in do_wp_page()
4070 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); in do_wp_page()
4075 vmf->orig_pte = pte; in do_wp_page()
4079 * Userfaultfd write-protect can defer flushes. Ensure the TLB in do_wp_page()
4082 if (unlikely(userfaultfd_wp(vmf->vma) && in do_wp_page()
4083 mm_tlb_flush_pending(vmf->vma->vm_mm))) in do_wp_page()
4084 flush_tlb_page(vmf->vma, vmf->address); in do_wp_page()
4087 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); in do_wp_page()
4089 if (vmf->page) in do_wp_page()
4090 folio = page_folio(vmf->page); in do_wp_page()
4096 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in do_wp_page()
4099 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. in do_wp_page()
4102 * Just mark the pages writable and/or call ops->pfn_mkwrite. in do_wp_page()
4104 if (!vmf->page || is_fsdax_page(vmf->page)) { in do_wp_page()
4105 vmf->page = NULL; in do_wp_page()
4119 (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { in do_wp_page()
4120 if (!PageAnonExclusive(vmf->page)) in do_wp_page()
4121 SetPageAnonExclusive(vmf->page); in do_wp_page()
4123 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
4135 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
4147 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); in unmap_mapping_range_vma()
4159 vba = vma->vm_pgoff; in unmap_mapping_range_tree()
4160 vea = vba + vma_pages(vma) - 1; in unmap_mapping_range_tree()
4165 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, in unmap_mapping_range_tree()
4166 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, in unmap_mapping_range_tree()
4172 * unmap_mapping_folio() - Unmap single folio from processes.
4184 struct address_space *mapping = folio->mapping; in unmap_mapping_folio()
4191 first_index = folio->index; in unmap_mapping_folio()
4192 last_index = folio_next_index(folio) - 1; in unmap_mapping_folio()
4199 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) in unmap_mapping_folio()
4200 unmap_mapping_range_tree(&mapping->i_mmap, first_index, in unmap_mapping_folio()
4206 * unmap_mapping_pages() - Unmap pages from processes.
4222 pgoff_t last_index = start + nr - 1; in unmap_mapping_pages()
4229 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) in unmap_mapping_pages()
4230 unmap_mapping_range_tree(&mapping->i_mmap, first_index, in unmap_mapping_pages()
4237 * unmap_mapping_range - unmap the portion of all mmaps in the specified
4257 pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; in unmap_mapping_range()
4262 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; in unmap_mapping_range()
4264 hlen = ULONG_MAX - hba + 1; in unmap_mapping_range()
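
For context, here is a hedged sketch of how a caller (typically a filesystem truncating or hole-punching a file) might invoke unmap_mapping_range(), documented above; the my_fs_punch_hole_mappings() wrapper and its arguments are illustrative only.

#include <linux/fs.h>
#include <linux/mm.h>

static void my_fs_punch_hole_mappings(struct inode *inode,
				      loff_t start, loff_t len)
{
	/* even_cows == 1: also zap private COW copies of the pages,
	 * not just the shared file-backed mappings. */
	unmap_mapping_range(inode->i_mapping, start, len, 1);
}
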
4276 struct folio *folio = page_folio(vmf->page); in remove_device_exclusive_entry()
4277 struct vm_area_struct *vma = vmf->vma; in remove_device_exclusive_entry()
4283 * the PTL so a racing thread can remove the device-exclusive in remove_device_exclusive_entry()
4286 * been re-allocated after being freed all we do is lock and in remove_device_exclusive_entry()
4298 vma->vm_mm, vmf->address & PAGE_MASK, in remove_device_exclusive_entry()
4299 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); in remove_device_exclusive_entry()
4302 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, in remove_device_exclusive_entry()
4303 &vmf->ptl); in remove_device_exclusive_entry()
4304 if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in remove_device_exclusive_entry()
4305 restore_exclusive_pte(vma, folio, vmf->page, vmf->address, in remove_device_exclusive_entry()
4306 vmf->pte, vmf->orig_pte); in remove_device_exclusive_entry()
4308 if (vmf->pte) in remove_device_exclusive_entry()
4309 pte_unmap_unlock(vmf->pte, vmf->ptl); in remove_device_exclusive_entry()
4323 if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || in should_try_to_free_swap()
4338 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in pte_marker_clear()
4339 vmf->address, &vmf->ptl); in pte_marker_clear()
4340 if (!vmf->pte) in pte_marker_clear()
4343 * Be careful so that we will only recover a special uffd-wp pte into a in pte_marker_clear()
4350 if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) in pte_marker_clear()
4351 pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); in pte_marker_clear()
4352 pte_unmap_unlock(vmf->pte, vmf->ptl); in pte_marker_clear()
4358 if (vma_is_anonymous(vmf->vma)) in do_pte_missing()
4365 * This is actually a page-missing access, but with uffd-wp special pte
4366 * installed. It means this pte was wr-protected before being unmapped.
4372 * got unregistered - we can simply clear them. in pte_marker_handle_uffd_wp()
4374 if (unlikely(!userfaultfd_wp(vmf->vma))) in pte_marker_handle_uffd_wp()
4382 swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); in handle_pte_marker()
4392 /* Higher priority than uffd-wp when data corrupted */ in handle_pte_marker()
4409 struct vm_area_struct *vma = vmf->vma; in __alloc_swap_folio()
4413 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); in __alloc_swap_folio()
4417 entry = pte_to_swp_entry(vmf->orig_pte); in __alloc_swap_folio()
4418 if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, in __alloc_swap_folio()
4439 addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); in can_swapin_thp()
4440 idx = (vmf->address - addr) / PAGE_SIZE; in can_swapin_thp()
4443 if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) in can_swapin_thp()
4464 unsigned long orders) in thp_swap_suitable_orders() argument
4468 order = highest_order(orders); in thp_swap_suitable_orders()
4475 while (orders) { in thp_swap_suitable_orders()
4479 order = next_order(&orders, order); in thp_swap_suitable_orders()
4482 return orders; in thp_swap_suitable_orders()
4487 struct vm_area_struct *vma = vmf->vma; in alloc_swap_folio()
4488 unsigned long orders; in alloc_swap_folio() local
4498 * If uffd is active for the vma we need per-page fault fidelity to in alloc_swap_folio()
4506 * lack handling for such cases, so fallback to swapping in order-0 in alloc_swap_folio()
4512 entry = pte_to_swp_entry(vmf->orig_pte); in alloc_swap_folio()
4514 * Get a list of all the (large) orders below PMD_ORDER that are enabled in alloc_swap_folio()
4517 orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, in alloc_swap_folio()
4518 BIT(PMD_ORDER) - 1); in alloc_swap_folio()
4519 orders = thp_vma_suitable_orders(vma, vmf->address, orders); in alloc_swap_folio()
4520 orders = thp_swap_suitable_orders(swp_offset(entry), in alloc_swap_folio()
4521 vmf->address, orders); in alloc_swap_folio()
4523 if (!orders) in alloc_swap_folio()
4526 pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in alloc_swap_folio()
4527 vmf->address & PMD_MASK, &ptl); in alloc_swap_folio()
4535 order = highest_order(orders); in alloc_swap_folio()
4536 while (orders) { in alloc_swap_folio()
4537 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_swap_folio()
4540 order = next_order(&orders, order); in alloc_swap_folio()
4545 /* Try allocating the highest of the remaining orders. */ in alloc_swap_folio()
4547 while (orders) { in alloc_swap_folio()
4548 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_swap_folio()
4551 if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, in alloc_swap_folio()
4558 order = next_order(&orders, order); in alloc_swap_folio()
4574 * We enter with non-exclusive mmap_lock (to exclude vma changes,
4583 struct vm_area_struct *vma = vmf->vma; in do_swap_page()
4603 entry = pte_to_swp_entry(vmf->orig_pte); in do_swap_page()
4606 migration_entry_wait(vma->vm_mm, vmf->pmd, in do_swap_page()
4607 vmf->address); in do_swap_page()
4609 vmf->page = pfn_swap_entry_to_page(entry); in do_swap_page()
4612 if (vmf->flags & FAULT_FLAG_VMA_LOCK) { in do_swap_page()
4622 vmf->page = pfn_swap_entry_to_page(entry); in do_swap_page()
4623 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_swap_page()
4624 vmf->address, &vmf->ptl); in do_swap_page()
4625 if (unlikely(!vmf->pte || in do_swap_page()
4626 !pte_same(ptep_get(vmf->pte), in do_swap_page()
4627 vmf->orig_pte))) in do_swap_page()
4634 if (trylock_page(vmf->page)) { in do_swap_page()
4637 get_page(vmf->page); in do_swap_page()
4638 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4639 pgmap = page_pgmap(vmf->page); in do_swap_page()
4640 ret = pgmap->ops->migrate_to_ram(vmf); in do_swap_page()
4641 unlock_page(vmf->page); in do_swap_page()
4642 put_page(vmf->page); in do_swap_page()
4644 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4651 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); in do_swap_page()
4664 swap_update_readahead(folio, vma, vmf->address); in do_swap_page()
4668 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && in do_swap_page()
4708 folio->swap = entry; in do_swap_page()
4710 folio->private = NULL; in do_swap_page()
4723 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_swap_page()
4724 vmf->address, &vmf->ptl); in do_swap_page()
4725 if (likely(vmf->pte && in do_swap_page()
4726 pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in do_swap_page()
4734 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); in do_swap_page()
4764 * folio->index of non-ksm folios would be nonlinear inside the in do_swap_page()
4765 * anon VMA -- the ksm flag is lost on actual swapout. in do_swap_page()
4767 folio = ksm_might_need_to_copy(folio, vma, vmf->address); in do_swap_page()
4772 } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { in do_swap_page()
4786 if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && in do_swap_page()
4796 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, in do_swap_page()
4797 &vmf->ptl); in do_swap_page()
4798 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in do_swap_page()
4809 unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); in do_swap_page()
4810 unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; in do_swap_page()
4811 pte_t *folio_ptep = vmf->pte - idx; in do_swap_page()
4814 if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || in do_swap_page()
4826 address = vmf->address; in do_swap_page()
4827 ptep = vmf->pte; in do_swap_page()
4831 unsigned long folio_start = address - idx * PAGE_SIZE; in do_swap_page()
4836 if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) in do_swap_page()
4838 if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) in do_swap_page()
4841 folio_ptep = vmf->pte - idx; in do_swap_page()
4843 if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || in do_swap_page()
4851 entry = folio->swap; in do_swap_page()
4852 page = &folio->page; in do_swap_page()
4872 exclusive = pte_swp_exclusive(vmf->orig_pte); in do_swap_page()
4876 * swapcache -> certainly exclusive. in do_swap_page()
4880 data_race(si->flags & SWP_STABLE_WRITES)) { in do_swap_page()
4916 if (should_try_to_free_swap(folio, vma, vmf->flags)) in do_swap_page()
4919 add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); in do_swap_page()
4920 add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); in do_swap_page()
4921 pte = mk_pte(page, vma->vm_page_prot); in do_swap_page()
4922 if (pte_swp_soft_dirty(vmf->orig_pte)) in do_swap_page()
4924 if (pte_swp_uffd_wp(vmf->orig_pte)) in do_swap_page()
4935 if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && in do_swap_page()
4938 if (vmf->flags & FAULT_FLAG_WRITE) { in do_swap_page()
4940 vmf->flags &= ~FAULT_FLAG_WRITE; in do_swap_page()
4945 folio_ref_add(folio, nr_pages - 1); in do_swap_page()
4947 vmf->orig_pte = pte_advance_pfn(pte, page_idx); in do_swap_page()
4970 set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); in do_swap_page()
4971 arch_do_swap_page_nr(vma->vm_mm, vma, address, in do_swap_page()
4988 if (vmf->flags & FAULT_FLAG_WRITE) { in do_swap_page()
4995 /* No need to invalidate - it was non-present before */ in do_swap_page()
4998 if (vmf->pte) in do_swap_page()
4999 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
5011 if (vmf->pte) in do_swap_page()
5012 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
5045 struct vm_area_struct *vma = vmf->vma; in alloc_anon_folio()
5047 unsigned long orders; in alloc_anon_folio() local
5055 * If uffd is active for the vma we need per-page fault fidelity to in alloc_anon_folio()
5062 * Get a list of all the (large) orders below PMD_ORDER that are enabled in alloc_anon_folio()
5063 * for this vma. Then filter out the orders that can't be allocated over in alloc_anon_folio()
5066 orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, in alloc_anon_folio()
5067 BIT(PMD_ORDER) - 1); in alloc_anon_folio()
5068 orders = thp_vma_suitable_orders(vma, vmf->address, orders); in alloc_anon_folio()
5070 if (!orders) in alloc_anon_folio()
5073 pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); in alloc_anon_folio()
5075 return ERR_PTR(-EAGAIN); in alloc_anon_folio()
5079 * pte_none(). Note that all remaining orders will be completely in alloc_anon_folio()
5082 order = highest_order(orders); in alloc_anon_folio()
5083 while (orders) { in alloc_anon_folio()
5084 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_anon_folio()
5087 order = next_order(&orders, order); in alloc_anon_folio()
5092 if (!orders) in alloc_anon_folio()
5095 /* Try allocating the highest of the remaining orders. */ in alloc_anon_folio()
5097 while (orders) { in alloc_anon_folio()
5098 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_anon_folio()
5101 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { in alloc_anon_folio()
5115 folio_zero_user(folio, vmf->address); in alloc_anon_folio()
5120 order = next_order(&orders, order); in alloc_anon_folio()
5125 return folio_prealloc(vma->vm_mm, vma, vmf->address, true); in alloc_anon_folio()
5129 * We enter with non-exclusive mmap_lock (to exclude vma changes,
5135 struct vm_area_struct *vma = vmf->vma; in do_anonymous_page()
5136 unsigned long addr = vmf->address; in do_anonymous_page()
5142 /* File mapping without ->vm_ops ? */ in do_anonymous_page()
5143 if (vma->vm_flags & VM_SHARED) in do_anonymous_page()
5150 if (pte_alloc(vma->vm_mm, vmf->pmd)) in do_anonymous_page()
5153 /* Use the zero-page for reads */ in do_anonymous_page()
5154 if (!(vmf->flags & FAULT_FLAG_WRITE) && in do_anonymous_page()
5155 !mm_forbids_zeropage(vma->vm_mm)) { in do_anonymous_page()
5156 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), in do_anonymous_page()
5157 vma->vm_page_prot)); in do_anonymous_page()
5158 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_anonymous_page()
5159 vmf->address, &vmf->ptl); in do_anonymous_page()
5160 if (!vmf->pte) in do_anonymous_page()
5163 update_mmu_tlb(vma, vmf->address, vmf->pte); in do_anonymous_page()
5166 ret = check_stable_address_space(vma->vm_mm); in do_anonymous_page()
5171 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
5181 /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ in do_anonymous_page()
5189 addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); in do_anonymous_page()
5198 entry = folio_mk_pte(folio, vma->vm_page_prot); in do_anonymous_page()
5200 if (vma->vm_flags & VM_WRITE) in do_anonymous_page()
5203 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); in do_anonymous_page()
5204 if (!vmf->pte) in do_anonymous_page()
5207 update_mmu_tlb(vma, addr, vmf->pte); in do_anonymous_page()
5209 } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { in do_anonymous_page()
5210 update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); in do_anonymous_page()
5214 ret = check_stable_address_space(vma->vm_mm); in do_anonymous_page()
5220 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
5225 folio_ref_add(folio, nr_pages - 1); in do_anonymous_page()
5226 add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); in do_anonymous_page()
5233 set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); in do_anonymous_page()
5235 /* No need to invalidate - it was non-present before */ in do_anonymous_page()
5236 update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); in do_anonymous_page()
5238 if (vmf->pte) in do_anonymous_page()
5239 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
5250 * released depending on flags and vma->vm_ops->fault() return value.
5255 struct vm_area_struct *vma = vmf->vma; in __do_fault()
5274 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { in __do_fault()
5275 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); in __do_fault()
5276 if (!vmf->prealloc_pte) in __do_fault()
5280 ret = vma->vm_ops->fault(vmf); in __do_fault()
5285 folio = page_folio(vmf->page); in __do_fault()
5286 if (unlikely(PageHWPoison(vmf->page))) { in __do_fault()
5289 if (page_mapped(vmf->page)) in __do_fault()
5292 if (mapping_evict_folio(folio->mapping, folio)) in __do_fault()
5297 vmf->page = NULL; in __do_fault()
5304 VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); in __do_fault()
5312 struct vm_area_struct *vma = vmf->vma; in deposit_prealloc_pte()
5314 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); in deposit_prealloc_pte()
5319 mm_inc_nr_ptes(vma->vm_mm); in deposit_prealloc_pte()
5320 vmf->prealloc_pte = NULL; in deposit_prealloc_pte()
5325 struct vm_area_struct *vma = vmf->vma; in do_set_pmd()
5326 bool write = vmf->flags & FAULT_FLAG_WRITE; in do_set_pmd()
5327 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; in do_set_pmd()
5334 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any in do_set_pmd()
5338 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, in do_set_pmd()
5347 page = &folio->page; in do_set_pmd()
5362 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { in do_set_pmd()
5363 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); in do_set_pmd()
5364 if (!vmf->prealloc_pte) in do_set_pmd()
5368 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); in do_set_pmd()
5369 if (unlikely(!pmd_none(*vmf->pmd))) in do_set_pmd()
5374 entry = folio_mk_pmd(folio, vma->vm_page_prot); in do_set_pmd()
5378 add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); in do_set_pmd()
5387 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); in do_set_pmd()
5389 update_mmu_cache_pmd(vma, haddr, vmf->pmd); in do_set_pmd()
5395 spin_unlock(vmf->ptl); in do_set_pmd()
5406 * set_pte_range - Set a range of PTEs to point to pages in a folio.
5416 struct vm_area_struct *vma = vmf->vma; in set_pte_range()
5417 bool write = vmf->flags & FAULT_FLAG_WRITE; in set_pte_range()
5418 bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); in set_pte_range()
5422 entry = mk_pte(page, vma->vm_page_prot); in set_pte_range()
5435 /* copy-on-write page */ in set_pte_range()
5436 if (write && !(vma->vm_flags & VM_SHARED)) { in set_pte_range()
5443 set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); in set_pte_range()
5445 /* no need to invalidate: a not-present page won't be cached */ in set_pte_range()
5446 update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); in set_pte_range()
5451 if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) in vmf_pte_changed()
5452 return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); in vmf_pte_changed()
5454 return !pte_none(ptep_get(vmf->pte)); in vmf_pte_changed()
5458 * finish_fault - finish page fault once we have prepared the page to fault
5474 struct vm_area_struct *vma = vmf->vma; in finish_fault()
5478 bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && in finish_fault()
5479 !(vma->vm_flags & VM_SHARED); in finish_fault()
5485 addr = vmf->address; in finish_fault()
5489 page = vmf->cow_page; in finish_fault()
5491 page = vmf->page; in finish_fault()
5498 if (!(vma->vm_flags & VM_SHARED)) { in finish_fault()
5499 ret = check_stable_address_space(vma->vm_mm); in finish_fault()
5504 if (pmd_none(*vmf->pmd)) { in finish_fault()
5511 if (vmf->prealloc_pte) in finish_fault()
5512 pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); in finish_fault()
5513 else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) in finish_fault()
5519 /* Using per-page fault to maintain the uffd semantics */ in finish_fault()
5524 /* The page offset of vmf->address within the VMA. */ in finish_fault()
5525 pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; in finish_fault()
5527 pgoff_t pte_off = pte_index(vmf->address); in finish_fault()
5530 * Fallback to per-page fault in case the folio size in page in finish_fault()
5534 vma_off + (nr_pages - idx) > vma_pages(vma) || in finish_fault()
5536 pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { in finish_fault()
5540 addr = vmf->address - idx * PAGE_SIZE; in finish_fault()
5541 page = &folio->page; in finish_fault()
5545 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in finish_fault()
5546 addr, &vmf->ptl); in finish_fault()
5547 if (!vmf->pte) in finish_fault()
5550 /* Re-check under ptl */ in finish_fault()
5552 update_mmu_tlb(vma, addr, vmf->pte); in finish_fault()
5555 } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { in finish_fault()
5557 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_fault()
5561 folio_ref_add(folio, nr_pages - 1); in finish_fault()
5564 add_mm_counter(vma->vm_mm, type, nr_pages); in finish_fault()
5568 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_fault()
5589 return -EINVAL; in fault_around_bytes_set()
5592 * The minimum value is 1 page, however this results in no fault-around in fault_around_bytes_set()
5617 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5618 * not ready to be mapped: not up-to-date, locked, etc.
5635 pgoff_t pte_off = pte_index(vmf->address); in do_fault_around()
5636 /* The page offset of vmf->address within the VMA. */ in do_fault_around()
5637 pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; in do_fault_around()
5643 pte_off - min(pte_off, vma_off)); in do_fault_around()
5647 pte_off + vma_pages(vmf->vma) - vma_off) - 1; in do_fault_around()
5649 if (pmd_none(*vmf->pmd)) { in do_fault_around()
5650 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); in do_fault_around()
5651 if (!vmf->prealloc_pte) in do_fault_around()
5656 ret = vmf->vma->vm_ops->map_pages(vmf, in do_fault_around()
5657 vmf->pgoff + from_pte - pte_off, in do_fault_around()
5658 vmf->pgoff + to_pte - pte_off); in do_fault_around()
5664 /* Return true if we should do read fault-around, false otherwise */
5667 /* No ->map_pages? No way to fault around... */ in should_fault_around()
5668 if (!vmf->vma->vm_ops->map_pages) in should_fault_around()
5671 if (uffd_disable_fault_around(vmf->vma)) in should_fault_around()
5684 * Let's call ->map_pages() first and use ->fault() as fallback in do_read_fault()
5703 folio = page_folio(vmf->page); in do_read_fault()
5712 struct vm_area_struct *vma = vmf->vma; in do_cow_fault()
5722 folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); in do_cow_fault()
5726 vmf->cow_page = &folio->page; in do_cow_fault()
5734 if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { in do_cow_fault()
5742 unlock_page(vmf->page); in do_cow_fault()
5743 put_page(vmf->page); in do_cow_fault()
5754 struct vm_area_struct *vma = vmf->vma; in do_shared_fault()
5766 folio = page_folio(vmf->page); in do_shared_fault()
5772 if (vma->vm_ops->page_mkwrite) { in do_shared_fault()
5795 * We enter with non-exclusive mmap_lock (to exclude vma changes,
5804 struct vm_area_struct *vma = vmf->vma; in do_fault()
5805 struct mm_struct *vm_mm = vma->vm_mm; in do_fault()
5811 if (!vma->vm_ops->fault) { in do_fault()
5812 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in do_fault()
5813 vmf->address, &vmf->ptl); in do_fault()
5814 if (unlikely(!vmf->pte)) in do_fault()
5824 if (unlikely(pte_none(ptep_get(vmf->pte)))) in do_fault()
5829 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_fault()
5831 } else if (!(vmf->flags & FAULT_FLAG_WRITE)) in do_fault()
5833 else if (!(vma->vm_flags & VM_SHARED)) in do_fault()
5839 if (vmf->prealloc_pte) { in do_fault()
5840 pte_free(vm_mm, vmf->prealloc_pte); in do_fault()
5841 vmf->prealloc_pte = NULL; in do_fault()
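/*
 * Dispatch summary for the do_fault() branches visible above: a VMA
 * without a ->fault handler is dealt with directly (the PTE is
 * inspected under its lock and the fault fails if nothing is mapped
 * there); otherwise read faults go to do_read_fault(), write faults on
 * private mappings to do_cow_fault(), and write faults on shared
 * mappings to do_shared_fault().  Any PTE table preallocated along the
 * way is released before returning.
 */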
5850 struct vm_area_struct *vma = vmf->vma; in numa_migrate_check()
5867 if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) in numa_migrate_check()
5874 *last_cpupid = (-1 & LAST_CPUPID_MASK); in numa_migrate_check()
5900 pte = pte_modify(old_pte, vma->vm_page_prot); in numa_rebuild_single_mapping()
5912 int nr = pte_pfn(fault_pte) - folio_pfn(folio); in numa_rebuild_large_mapping()
5913 unsigned long start, end, addr = vmf->address; in numa_rebuild_large_mapping()
5914 unsigned long addr_start = addr - (nr << PAGE_SHIFT); in numa_rebuild_large_mapping()
5919 start = max3(addr_start, pt_start, vma->vm_start); in numa_rebuild_large_mapping()
5921 vma->vm_end); in numa_rebuild_large_mapping()
5922 start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); in numa_rebuild_large_mapping()
5936 ptent = pte_modify(ptent, vma->vm_page_prot); in numa_rebuild_large_mapping()
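/*
 * Range computation above, restated: the folio's first page maps at
 * addr - nr * PAGE_SIZE, and the span of PTEs worth rebuilding is
 * clamped so that it stays within the folio, within the PTE table that
 * vmf->pte lives in, and within the VMA.  start_ptep is then obtained
 * by stepping back from vmf->pte by the number of pages between the
 * fault address and the clamped start.
 */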
5949 struct vm_area_struct *vma = vmf->vma; in do_numa_page()
5963 spin_lock(vmf->ptl); in do_numa_page()
5965 old_pte = ptep_get(vmf->pte); in do_numa_page()
5967 if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { in do_numa_page()
5968 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
5972 pte = pte_modify(old_pte, vma->vm_page_prot); in do_numa_page()
5980 can_change_pte_writable(vma, vmf->address, pte)) in do_numa_page()
5983 folio = vm_normal_folio(vma, vmf->address, pte); in do_numa_page()
5990 target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, in do_numa_page()
5999 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
6012 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_numa_page()
6013 vmf->address, &vmf->ptl); in do_numa_page()
6014 if (unlikely(!vmf->pte)) in do_numa_page()
6016 if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in do_numa_page()
6017 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
6023 * non-accessible ptes, some can allow access by kernel mode. in do_numa_page()
6029 numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, in do_numa_page()
6031 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
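/*
 * Flow of the NUMA hinting fault above: the saved orig_pte is
 * re-validated under the PTE lock, the PTE is rebuilt with the VMA's
 * protections (and made writable again when can_change_pte_writable()
 * allows it), and numa_migrate_check() picks a target node for the
 * folio.  The lock is dropped for the migration attempt; if the folio
 * was not migrated, the PTE table is remapped, orig_pte is checked once
 * more, and the mapping is rebuilt in place.
 */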
6040 struct vm_area_struct *vma = vmf->vma; in create_huge_pmd()
6043 if (vma->vm_ops->huge_fault) in create_huge_pmd()
6044 return vma->vm_ops->huge_fault(vmf, PMD_ORDER); in create_huge_pmd()
6051 struct vm_area_struct *vma = vmf->vma; in wp_huge_pmd()
6052 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in wp_huge_pmd()
6057 userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { in wp_huge_pmd()
6058 if (userfaultfd_wp_async(vmf->vma)) in wp_huge_pmd()
6065 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in wp_huge_pmd()
6066 if (vma->vm_ops->huge_fault) { in wp_huge_pmd()
6067 ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); in wp_huge_pmd()
6074 /* COW or write-notify handled on pte level: split pmd. */ in wp_huge_pmd()
6075 __split_huge_pmd(vma, vmf->pmd, vmf->address, false); in wp_huge_pmd()
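/*
 * Fallback visible above: shared mappings with a ->huge_fault handler
 * resolve the write fault at PMD granularity; for everything else the
 * PMD is split and COW / write-notify is handled by the ordinary PTE
 * path instead.
 */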
6084 struct vm_area_struct *vma = vmf->vma; in create_huge_pud()
6088 if (vma->vm_ops->huge_fault) in create_huge_pud()
6089 return vma->vm_ops->huge_fault(vmf, PUD_ORDER); in create_huge_pud()
6098 struct vm_area_struct *vma = vmf->vma; in wp_huge_pud()
6104 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in wp_huge_pud()
6105 if (vma->vm_ops->huge_fault) { in wp_huge_pud()
6106 ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); in wp_huge_pud()
6112 /* COW or write-notify not handled on PUD level: split pud. */ in wp_huge_pud()
6113 __split_huge_pud(vma, vmf->pud, vmf->address); in wp_huge_pud()
6127 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
6137 if (unlikely(pmd_none(*vmf->pmd))) { in handle_pte_fault()
6139 * Leave __pte_alloc() until later: because vm_ops->fault may in handle_pte_fault()
6144 vmf->pte = NULL; in handle_pte_fault()
6145 vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; in handle_pte_fault()
6155 * Use the maywrite version to indicate that vmf->pte may be in handle_pte_fault()
6162 vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, in handle_pte_fault()
6163 vmf->address, &dummy_pmdval, in handle_pte_fault()
6164 &vmf->ptl); in handle_pte_fault()
6165 if (unlikely(!vmf->pte)) in handle_pte_fault()
6167 vmf->orig_pte = ptep_get_lockless(vmf->pte); in handle_pte_fault()
6168 vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; in handle_pte_fault()
6170 if (pte_none(vmf->orig_pte)) { in handle_pte_fault()
6171 pte_unmap(vmf->pte); in handle_pte_fault()
6172 vmf->pte = NULL; in handle_pte_fault()
6176 if (!vmf->pte) in handle_pte_fault()
6179 if (!pte_present(vmf->orig_pte)) in handle_pte_fault()
6182 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) in handle_pte_fault()
6185 spin_lock(vmf->ptl); in handle_pte_fault()
6186 entry = vmf->orig_pte; in handle_pte_fault()
6187 if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { in handle_pte_fault()
6188 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); in handle_pte_fault()
6191 if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { in handle_pte_fault()
6194 else if (likely(vmf->flags & FAULT_FLAG_WRITE)) in handle_pte_fault()
6198 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, in handle_pte_fault()
6199 vmf->flags & FAULT_FLAG_WRITE)) { in handle_pte_fault()
6200 update_mmu_cache_range(vmf, vmf->vma, vmf->address, in handle_pte_fault()
6201 vmf->pte, 1); in handle_pte_fault()
6204 if (vmf->flags & FAULT_FLAG_TRIED) in handle_pte_fault()
6212 if (vmf->flags & FAULT_FLAG_WRITE) in handle_pte_fault()
6213 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, in handle_pte_fault()
6214 vmf->pte); in handle_pte_fault()
6217 pte_unmap_unlock(vmf->pte, vmf->ptl); in handle_pte_fault()
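/*
 * Shape of handle_pte_fault() as seen above: PTE table allocation is
 * deferred while the PMD is still empty, orig_pte is sampled locklessly
 * via pte_offset_map_rw_nolock(), and the none / not-present /
 * prot-numa cases are dispatched before the PTE lock is taken.  The
 * remaining access-bit and write-permission updates re-check orig_pte
 * under the lock and, when ptep_set_access_flags() reports no change,
 * at most flush the TLB for a suspected spurious write fault.
 */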
6238 struct mm_struct *mm = vma->vm_mm; in __handle_mm_fault()
6239 vm_flags_t vm_flags = vma->vm_flags; in __handle_mm_fault()
6322 * mm_account_fault - Do page fault accounting
6325 * of perf event counters, but we'll still do the per-task accounting to
6334 * still be in per-arch page fault handlers at the entry of page fault.
6371 current->maj_flt++; in mm_account_fault()
6373 current->min_flt++; in mm_account_fault()
6393 current->in_lru_fault = vma_has_recency(vma); in lru_gen_enter_fault()
6398 current->in_lru_fault = false; in lru_gen_exit_fault()
6418 * just treat it like an ordinary read-fault otherwise. in sanitize_fault_flags()
6420 if (!is_cow_mapping(vma->vm_flags)) in sanitize_fault_flags()
6423 /* Write faults on read-only mappings are impossible ... */ in sanitize_fault_flags()
6424 if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) in sanitize_fault_flags()
6427 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && in sanitize_fault_flags()
6428 !is_cow_mapping(vma->vm_flags))) in sanitize_fault_flags()
6433 * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of in sanitize_fault_flags()
6456 struct mm_struct *mm = vma->vm_mm; in handle_mm_fault()
6473 is_droppable = !!(vma->vm_flags & VM_DROPPABLE); in handle_mm_fault()
6485 ret = hugetlb_fault(vma->vm_mm, vma, address, flags); in handle_mm_fault()
6490 * Warning: It is no longer safe to dereference vma-> after this point, in handle_mm_fault()
6522 * We've already handled the fast-path in-line.
6528 return -ENOMEM; in __p4d_alloc()
6530 spin_lock(&mm->page_table_lock); in __p4d_alloc()
6537 spin_unlock(&mm->page_table_lock); in __p4d_alloc()
6545 * We've already handled the fast-path in-line.
6551 return -ENOMEM; in __pud_alloc()
6553 spin_lock(&mm->page_table_lock); in __pud_alloc()
6560 spin_unlock(&mm->page_table_lock); in __pud_alloc()
6568 * We've already handled the fast-path in-line.
6575 return -ENOMEM; in __pmd_alloc()
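/*
 * Illustrative sketch (not from mm/memory.c): the "fast path handled
 * in-line" that the comments above refer to.  The inline allocators in
 * <linux/mm.h> only call the __pXd_alloc() slow paths when the upper
 * level entry is still empty; the p4d-level wrapper looks approximately
 * like this (the function name here is a sketch, not the real one).
 */
static inline p4d_t *p4d_alloc_sketch(struct mm_struct *mm, pgd_t *pgd,
				      unsigned long address)
{
	if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
		return NULL;			/* allocation failed */
	return p4d_offset(pgd, address);	/* entry now populated */
}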
6596 args->lock = lock; in pfnmap_args_setup()
6597 args->ptep = ptep; in pfnmap_args_setup()
6598 args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); in pfnmap_args_setup()
6599 args->addr_mask = addr_mask; in pfnmap_args_setup()
6600 args->pgprot = pgprot; in pfnmap_args_setup()
6601 args->writable = writable; in pfnmap_args_setup()
6602 args->special = special; in pfnmap_args_setup()
6608 struct file *file = vma->vm_file; in pfnmap_lockdep_assert()
6609 struct address_space *mapping = file ? file->f_mapping : NULL; in pfnmap_lockdep_assert()
6612 lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || in pfnmap_lockdep_assert()
6613 lockdep_is_held(&vma->vm_mm->mmap_lock)); in pfnmap_lockdep_assert()
6615 lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); in pfnmap_lockdep_assert()
6620 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6623 * The caller needs to setup args->vma and args->address to point to the
6640 * a later point in time can trigger use-after-free.
6652 struct vm_area_struct *vma = args->vma; in follow_pfnmap_start()
6653 unsigned long address = args->address; in follow_pfnmap_start()
6654 struct mm_struct *mm = vma->vm_mm; in follow_pfnmap_start()
6664 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) in follow_pfnmap_start()
6667 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) in follow_pfnmap_start()
6722 return -EINVAL; in follow_pfnmap_start()
6735 if (args->lock) in follow_pfnmap_end()
6736 spin_unlock(args->lock); in follow_pfnmap_end()
6737 if (args->ptep) in follow_pfnmap_end()
6738 pte_unmap(args->ptep); in follow_pfnmap_end()
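/*
 * Illustrative usage sketch (not from mm/memory.c): the start/end pair
 * above is meant to bracket a short, non-sleeping access.  The wrapper
 * name is hypothetical; args.pfn and args.writable are only stable
 * while the bracket is held, which is why nothing derived from the PFN
 * is used after follow_pfnmap_end().
 */
static bool vaddr_maps_writable_pfn(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
	bool writable;

	if (follow_pfnmap_start(&args))
		return false;
	writable = args.writable;
	follow_pfnmap_end(&args);
	return writable;
}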
6744 * generic_access_phys - generic implementation for iomem mmap access
6762 int ret = -EINVAL; in generic_access_phys()
6768 return -EINVAL; in generic_access_phys()
6775 return -EINVAL; in generic_access_phys()
6779 return -ENOMEM; in generic_access_phys()
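/*
 * Typical wiring for the helper above: a driver that mmap()s MMIO can
 * expose generic_access_phys() as the VMA's ->access method in its
 * vm_operations_struct, so that access_process_vm() / ptrace peeks can
 * reach the mapping through the vma->vm_ops->access path used by
 * __access_remote_vm() below.
 */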
6842 return buf - old_buf; in __access_remote_vm()
6854 if (vma->vm_ops && vma->vm_ops->access) in __access_remote_vm()
6855 bytes = vma->vm_ops->access(vma, addr, buf, in __access_remote_vm()
6863 offset = addr & (PAGE_SIZE - 1); in __access_remote_vm()
6864 if (bytes > PAGE_SIZE - offset) in __access_remote_vm()
6865 bytes = PAGE_SIZE - offset; in __access_remote_vm()
6878 len -= bytes; in __access_remote_vm()
6884 return buf - old_buf; in __access_remote_vm()
6888 * access_remote_vm - access another process's address space
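/*
 * Illustrative usage sketch (not from mm/memory.c): a ptrace-style peek
 * through access_remote_vm().  The wrapper name is hypothetical; the
 * caller is assumed to already hold a reference on @mm (e.g. via
 * get_task_mm()), and FOLL_FORCE mirrors what ptrace passes for
 * debugger access.
 */
static int peek_remote_bytes(struct mm_struct *mm, unsigned long addr,
			     void *buf, int len)
{
	int copied = access_remote_vm(mm, addr, buf, len, FOLL_FORCE);

	return copied == len ? 0 : -EFAULT;
}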
6931 * If there is any error return -EFAULT.
6942 return -EFAULT; in __copy_remote_vm_str()
6948 err = -EFAULT; in __copy_remote_vm_str()
6967 err = -EFAULT; in __copy_remote_vm_str()
6973 offset = addr & (PAGE_SIZE - 1); in __copy_remote_vm_str()
6974 if (bytes > PAGE_SIZE - offset) in __copy_remote_vm_str()
6975 bytes = PAGE_SIZE - offset; in __copy_remote_vm_str()
6986 buf += bytes - 1; in __copy_remote_vm_str()
6993 addr += bytes - 1; in __copy_remote_vm_str()
6994 copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); in __copy_remote_vm_str()
6998 len -= bytes; in __copy_remote_vm_str()
7007 return buf - old_buf; in __copy_remote_vm_str()
7011 * copy_remote_vm_str - copy a string from another process's address space.
7021 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
7022 * buffer. On any error, return -EFAULT.
7036 return -EFAULT; in copy_remote_vm_str()
7053 struct mm_struct *mm = current->mm; in print_vma_addr()
7063 if (vma && vma->vm_file) { in print_vma_addr()
7064 struct file *f = vma->vm_file; in print_vma_addr()
7065 ip -= vma->vm_start; in print_vma_addr()
7066 ip += vma->vm_pgoff << PAGE_SHIFT; in print_vma_addr()
7068 vma->vm_start, in print_vma_addr()
7069 vma->vm_end - vma->vm_start); in print_vma_addr()
7075 void __might_fault(const char *file, int line) in __might_fault() argument
7079 __might_sleep(file, line); in __might_fault()
7080 if (current->mm) in __might_fault()
7081 might_lock_read(&current->mm->mmap_lock); in __might_fault()
7099 ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); in process_huge_page()
7103 n = (addr_hint - addr) / PAGE_SIZE; in process_huge_page()
7109 for (i = nr_pages - 1; i >= 2 * n; i--) { in process_huge_page()
7117 base = nr_pages - 2 * (nr_pages - n); in process_huge_page()
7118 l = nr_pages - n; in process_huge_page()
7128 * Process remaining subpages in left-right-left-right pattern in process_huge_page()
7133 int right_idx = base + 2 * l - 1 - i; in process_huge_page()
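/*
 * Worked example of the ordering above, for nr_pages = 8 and a target
 * subpage n = 1 (addr_hint in the first half of the huge page):
 * subpages 7,6,5,4,3,2 are processed first, then the left-right loop
 * walks 0,1, so the faulting subpage is touched last and is most likely
 * to still be hot in cache.  The second-half case mirrors this via the
 * base/l setup shown above.
 */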
7169 * folio_zero_user - Zero a folio which will be mapped to userspace.
7200 return -EHWPOISON; in copy_user_gigantic_page()
7214 struct page *dst = folio_page(copy_arg->dst, idx); in copy_subpage()
7215 struct page *src = folio_page(copy_arg->src, idx); in copy_subpage()
7217 if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) in copy_subpage()
7218 return -EHWPOISON; in copy_subpage()
7258 ret_val -= (PAGE_SIZE - rc); in copy_folio_from_user()
7276 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, in ptlock_cache_init()
7287 ptdesc->ptl = ptl; in ptlock_alloc()
7293 if (ptdesc->ptl) in ptlock_free()
7294 kmem_cache_free(page_ptl_cachep, ptdesc->ptl); in ptlock_free()
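/*
 * Context for ptlock_alloc()/ptlock_free() above: with split page-table
 * locks, each page table normally embeds its spinlock directly in
 * struct ptdesc.  When spinlock_t is too large to embed (e.g. with
 * lockdep enabled), the lock is instead allocated from the "page->ptl"
 * kmem_cache created in ptlock_cache_init() and reached via
 * ptdesc->ptl.
 */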