/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015  Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"
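
/*
 * Install one anonymous page at @dst_addr, filled with PAGE_SIZE bytes
 * from the userland address @src_addr.  If *@pagep is NULL the page is
 * allocated here and filled with an atomic (non-sleeping)
 * copy_from_user(); if that copy faults, the page is handed back
 * through @pagep so the caller can fill it with a sleeping
 * copy_from_user() outside mmap_sem and call again with *@pagep set.
 * Returns 0, -ENOMEM, -EFAULT, or -EEXIST if a pte is already present.
 */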
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -EFAULT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
	goto out;
}
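
/*
 * Resolve @dst_addr with the zero page instead of allocated memory:
 * the pte is mapped read-only and marked special, so no struct page
 * reference counting is involved.  Returns 0, or -EEXIST if a pte is
 * already present.
 */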
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}
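
/*
 * Walk, and if necessary allocate, the page table levels down to the
 * pmd covering @address.  Returns NULL only if allocating a missing
 * pud or pmd failed.
 */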
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (pud)
		/*
		 * Note that this doesn't run only when the pmd was
		 * missing: *pmd may already be established, and it may
		 * even be a trans_huge_pmd.
		 */
		pmd = pmd_alloc(mm, pud, address);
	return pmd;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held; it will release mmap_sem before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -EINVAL;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		/*
		 * Make sure the vma is not shared, and that the remaining
		 * dst range is both valid and fully within a single
		 * existing vma.
		 */
		if (dst_vma->vm_flags & VM_SHARED)
			goto out_unlock;
		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;
	}

	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	h = hstate_vma(dst_vma);
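
	/*
	 * Copy one huge page per iteration.  Taking the hugetlb fault
	 * mutex for the target index serializes against page faults
	 * racing to populate the same huge page.
	 */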
	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
						idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -EFAULT)) {
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.  It is possible that there was a
		 * reservation associated with the page that has been
		 * consumed.  See the routine restore_reserve_on_error
		 * for details.  Unfortunately, we can not call
		 * restore_reserve_on_error now as it would require holding
		 * mmap_sem.  Clear the PagePrivate flag so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.
		 */
		ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */
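
/*
 * Common implementation of the UFFDIO_COPY (zeropage == false) and
 * UFFDIO_ZEROPAGE (zeropage == true) ioctls: resolve each missing pte
 * in [dst_start, dst_start + len) one PAGE_SIZE step at a time.
 * Anonymous and shmem vmas are handled inline; hugetlb vmas are passed
 * off to __mcopy_atomic_hugetlb().
 */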
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared, and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -EINVAL;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to the appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
					      src_start, len, zeropage);

	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed.  As far as the VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges.  This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;
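
	/*
	 * On the retry path "page" already carries the source data,
	 * copied in with mmap_sem released; the first *_mcopy_atomic_pte
	 * call below consumes it instead of allocating a fresh page.
	 */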
	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		if (vma_is_anonymous(dst_vma)) {
			if (!zeropage)
				err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
						       dst_addr, src_addr,
						       &page);
			else
				err = mfill_zeropage_pte(dst_mm, dst_pmd,
							 dst_vma, dst_addr);
		} else {
			err = -EINVAL; /* if zeropage is true return -EINVAL */
			if (likely(!zeropage))
				err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
							     dst_vma, dst_addr,
							     src_addr, &page);
		}

		cond_resched();
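
		/*
		 * -EFAULT means the atomic copy_from_user() inside the
		 * *_mcopy_atomic_pte call could not access the source
		 * page: drop mmap_sem, fill the page with a sleeping
		 * copy, then retry the whole walk from the top.
		 */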
		if (unlikely(err == -EFAULT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
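
/*
 * mcopy_atomic() backs the UFFDIO_COPY ioctl and mfill_zeropage() the
 * UFFDIO_ZEROPAGE ioctl.  Both return the number of bytes resolved, or
 * a negative errno if no progress was made.
 */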
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true);
}
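
/*
 * For reference, a minimal sketch of the userspace side that reaches
 * mcopy_atomic() through the UFFDIO_COPY ioctl.  It assumes "uffd" came
 * from the userfaultfd(2) syscall, the destination range was registered
 * beforehand with UFFDIO_REGISTER, and "src"/"dst" are page aligned:
 *
 *	struct uffdio_copy copy;
 *
 *	copy.dst = (unsigned long) dst;
 *	copy.src = (unsigned long) src;
 *	copy.len = page_size;
 *	copy.mode = 0;
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) && errno == EAGAIN)
 *		... partial success: copy.copy holds the bytes already
 *		    resolved, mirroring the ssize_t that __mcopy_atomic()
 *		    returned, and the ioctl can be retried ...
 */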