xref: /linux/mm/mremap.c (revision 7203ca412fc8e8a0588e9adc0f777d3163f8dff3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/leafops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 #include <linux/pgalloc.h>
29 
30 #include <asm/cacheflush.h>
31 #include <asm/tlb.h>
32 
33 #include "internal.h"
34 
35 /* Classify the kind of remap operation being performed. */
36 enum mremap_type {
37 	MREMAP_INVALID,		/* Initial state. */
38 	MREMAP_NO_RESIZE,	/* old_len == new_len, if not moved, do nothing. */
39 	MREMAP_SHRINK,		/* old_len > new_len. */
40 	MREMAP_EXPAND,		/* old_len < new_len. */
41 };
42 
43 /*
44  * Describes a VMA mremap() operation and is threaded throughout it.
45  *
46  * Any of the fields may be mutated by the operation, however these values will
47  * always accurately reflect the remap (for instance, we may adjust lengths and
48  * delta to account for hugetlb alignment).
49  */
50 struct vma_remap_struct {
51 	/* User-provided state. */
52 	unsigned long addr;	/* User-specified address from which we remap. */
53 	unsigned long old_len;	/* Length of range being remapped. */
54 	unsigned long new_len;	/* Desired new length of mapping. */
55 	const unsigned long flags; /* user-specified MREMAP_* flags. */
56 	unsigned long new_addr;	/* Optionally, desired new address. */
57 
58 	/* uffd state. */
59 	struct vm_userfaultfd_ctx *uf;
60 	struct list_head *uf_unmap_early;
61 	struct list_head *uf_unmap;
62 
63 	/* VMA state, determined in do_mremap(). */
64 	struct vm_area_struct *vma;
65 
66 	/* Internal state, determined in do_mremap(). */
67 	unsigned long delta;		/* Absolute delta of old_len,new_len. */
68 	bool populate_expand;		/* mlock()'d expanded, must populate. */
69 	enum mremap_type remap_type;	/* expand, shrink, etc. */
70 	bool mmap_locked;		/* Is mm currently write-locked? */
71 	unsigned long charged;		/* If VM_ACCOUNT, # pages to account. */
72 	bool vmi_needs_invalidate;	/* Is the VMA iterator invalidated? */
73 };
74 
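/*
 * For illustration only (not part of this file): a userspace call such as
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *
 *	void *new = mremap(old, 16384, 32768, MREMAP_MAYMOVE);
 *
 * roughly corresponds to addr == old, old_len == 16384, new_len == 32768
 * and flags == MREMAP_MAYMOVE above, with the remaining fields filled in
 * by do_mremap(). Values are arbitrary.
 */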
75 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
76 {
77 	pgd_t *pgd;
78 	p4d_t *p4d;
79 	pud_t *pud;
80 
81 	pgd = pgd_offset(mm, addr);
82 	if (pgd_none_or_clear_bad(pgd))
83 		return NULL;
84 
85 	p4d = p4d_offset(pgd, addr);
86 	if (p4d_none_or_clear_bad(p4d))
87 		return NULL;
88 
89 	pud = pud_offset(p4d, addr);
90 	if (pud_none_or_clear_bad(pud))
91 		return NULL;
92 
93 	return pud;
94 }
95 
96 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
97 {
98 	pud_t *pud;
99 	pmd_t *pmd;
100 
101 	pud = get_old_pud(mm, addr);
102 	if (!pud)
103 		return NULL;
104 
105 	pmd = pmd_offset(pud, addr);
106 	if (pmd_none(*pmd))
107 		return NULL;
108 
109 	return pmd;
110 }
111 
112 static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
113 {
114 	pgd_t *pgd;
115 	p4d_t *p4d;
116 
117 	pgd = pgd_offset(mm, addr);
118 	p4d = p4d_alloc(mm, pgd, addr);
119 	if (!p4d)
120 		return NULL;
121 
122 	return pud_alloc(mm, p4d, addr);
123 }
124 
125 static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
126 {
127 	pud_t *pud;
128 	pmd_t *pmd;
129 
130 	pud = alloc_new_pud(mm, addr);
131 	if (!pud)
132 		return NULL;
133 
134 	pmd = pmd_alloc(mm, pud, addr);
135 	if (!pmd)
136 		return NULL;
137 
138 	VM_BUG_ON(pmd_trans_huge(*pmd));
139 
140 	return pmd;
141 }
142 
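/*
 * The helpers above walk or populate the page-table hierarchy one level at
 * a time:
 *
 *	pgd_offset() -> p4d_offset()/p4d_alloc() -> pud_offset()/pud_alloc()
 *	             -> pmd_offset()/pmd_alloc()
 *
 * As a rough illustration only (assuming x86-64 with 4KiB base pages), a
 * PUD entry spans 1GiB, a PMD entry 2MiB and a PTE 4KiB; with 4-level
 * paging the p4d level is folded into the pgd.
 */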
143 static void take_rmap_locks(struct vm_area_struct *vma)
144 {
145 	if (vma->vm_file)
146 		i_mmap_lock_write(vma->vm_file->f_mapping);
147 	if (vma->anon_vma)
148 		anon_vma_lock_write(vma->anon_vma);
149 }
150 
151 static void drop_rmap_locks(struct vm_area_struct *vma)
152 {
153 	if (vma->anon_vma)
154 		anon_vma_unlock_write(vma->anon_vma);
155 	if (vma->vm_file)
156 		i_mmap_unlock_write(vma->vm_file->f_mapping);
157 }
158 
159 static pte_t move_soft_dirty_pte(pte_t pte)
160 {
161 	if (pte_none(pte))
162 		return pte;
163 
164 	/*
165 	 * Set soft dirty bit so we can notice
166 	 * in userspace the ptes were moved.
167 	 */
168 	if (pgtable_supports_soft_dirty()) {
169 		if (pte_present(pte))
170 			pte = pte_mksoft_dirty(pte);
171 		else
172 			pte = pte_swp_mksoft_dirty(pte);
173 	}
174 
175 	return pte;
176 }
177 
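/*
 * For illustration only (not part of this file): a userspace monitor using
 * the soft-dirty mechanism would typically do something like
 *
 *	echo 4 > /proc/<pid>/clear_refs    (clear soft-dirty bits)
 *	... later read /proc/<pid>/pagemap and test bit 55 per page ...
 *
 * Propagating the soft-dirty bit in move_soft_dirty_pte() ensures such a
 * monitor still sees the moved pages as touched after mremap().
 */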
178 static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
179 		pte_t *ptep, pte_t pte, int max_nr)
180 {
181 	struct folio *folio;
182 
183 	if (max_nr == 1)
184 		return 1;
185 
186 	/* Avoid expensive folio lookup if we stand no chance of benefit. */
187 	if (pte_batch_hint(ptep, pte) == 1)
188 		return 1;
189 
190 	folio = vm_normal_folio(vma, addr, pte);
191 	if (!folio || !folio_test_large(folio))
192 		return 1;
193 
194 	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
195 }
196 
197 static int move_ptes(struct pagetable_move_control *pmc,
198 		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
199 {
200 	struct vm_area_struct *vma = pmc->old;
201 	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
202 	struct mm_struct *mm = vma->vm_mm;
203 	pte_t *old_ptep, *new_ptep;
204 	pte_t old_pte, pte;
205 	pmd_t dummy_pmdval;
206 	spinlock_t *old_ptl, *new_ptl;
207 	bool force_flush = false;
208 	unsigned long old_addr = pmc->old_addr;
209 	unsigned long new_addr = pmc->new_addr;
210 	unsigned long old_end = old_addr + extent;
211 	unsigned long len = old_end - old_addr;
212 	int max_nr_ptes;
213 	int nr_ptes;
214 	int err = 0;
215 
216 	/*
217 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
218 	 * locks to ensure that rmap will always observe either the old or the
219 	 * new ptes. This is the easiest way to avoid races with
220 	 * truncate_pagecache(), page migration, etc...
221 	 *
222 	 * When need_rmap_locks is false, we use other ways to avoid
223 	 * such races:
224 	 *
225 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
226 	 *   which rmap call sites look for using vma_is_temporary_stack().
227 	 *
228 	 * - During mremap(), new_vma is often known to be placed after vma
229 	 *   in rmap traversal order. This ensures rmap will always observe
230 	 *   either the old pte, or the new pte, or both (the page table locks
231 	 *   serialize access to individual ptes, but only rmap traversal
232 	 *   order guarantees that we won't miss both the old and new ptes).
233 	 */
234 	if (pmc->need_rmap_locks)
235 		take_rmap_locks(vma);
236 
237 	/*
238 	 * We don't have to worry about the ordering of src and dst
239 	 * pte locks because exclusive mmap_lock prevents deadlock.
240 	 */
241 	old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
242 	if (!old_ptep) {
243 		err = -EAGAIN;
244 		goto out;
245 	}
246 	/*
247 	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
248 	 * this by traversing file->f_mapping, so there is no concurrency with
249 	 * retract_page_tables(). In addition, we already hold the exclusive
250 	 * mmap_lock, so this new_pte page is stable, so there is no need to get
251 	 * pmdval and do pmd_same() check.
252 	 */
253 	new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
254 					   &new_ptl);
255 	if (!new_ptep) {
256 		pte_unmap_unlock(old_ptep, old_ptl);
257 		err = -EAGAIN;
258 		goto out;
259 	}
260 	if (new_ptl != old_ptl)
261 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
262 	flush_tlb_batched_pending(vma->vm_mm);
263 	arch_enter_lazy_mmu_mode();
264 
265 	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
266 		new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
267 		VM_WARN_ON_ONCE(!pte_none(*new_ptep));
268 
269 		nr_ptes = 1;
270 		max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
271 		old_pte = ptep_get(old_ptep);
272 		if (pte_none(old_pte))
273 			continue;
274 
275 		/*
276 		 * If we are remapping a valid PTE, make sure
277 		 * to flush TLB before we drop the PTL for the
278 		 * PTE.
279 		 *
280 		 * NOTE! Both old and new PTL matter: the old one
281 		 * for racing with folio_mkclean(), the new one to
282 		 * make sure the physical page stays valid until
283 		 * the TLB entry for the old mapping has been
284 		 * flushed.
285 		 */
286 		if (pte_present(old_pte)) {
287 			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
288 							 old_pte, max_nr_ptes);
289 			force_flush = true;
290 		}
291 		pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
292 		pte = move_pte(pte, old_addr, new_addr);
293 		pte = move_soft_dirty_pte(pte);
294 
295 		if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
296 			pte_clear(mm, new_addr, new_ptep);
297 		else {
298 			if (need_clear_uffd_wp) {
299 				if (pte_present(pte))
300 					pte = pte_clear_uffd_wp(pte);
301 				else
302 					pte = pte_swp_clear_uffd_wp(pte);
303 			}
304 			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
305 		}
306 	}
307 
308 	arch_leave_lazy_mmu_mode();
309 	if (force_flush)
310 		flush_tlb_range(vma, old_end - len, old_end);
311 	if (new_ptl != old_ptl)
312 		spin_unlock(new_ptl);
313 	pte_unmap(new_ptep - 1);
314 	pte_unmap_unlock(old_ptep - 1, old_ptl);
315 out:
316 	if (pmc->need_rmap_locks)
317 		drop_rmap_locks(vma);
318 	return err;
319 }
320 
321 #ifndef arch_supports_page_table_move
322 #define arch_supports_page_table_move arch_supports_page_table_move
323 static inline bool arch_supports_page_table_move(void)
324 {
325 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
326 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
327 }
328 #endif
329 
330 static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc)
331 {
332 	/*
333 	 * If we are moving a VMA that has uffd-wp registered but with
334 	 * remap events disabled (new VMA will not be registered with uffd), we
335 	 * need to ensure that the uffd-wp state is cleared from all pgtables.
336 	 * This means recursing into lower page tables in move_page_tables().
337 	 *
338 	 * We might get called with VMAs reversed when recovering from a
339 	 * failed page table move. In that case, the
340 	 * "old"-but-actually-"originally new" VMA during recovery will not have
341 	 * a uffd context. Recursing into lower page tables during the original
342 	 * move but not during the recovery move will cause trouble, because we
343 	 * run into already-existing page tables. So check both VMAs.
344 	 */
345 	return !vma_has_uffd_without_event_remap(pmc->old) &&
346 	       !vma_has_uffd_without_event_remap(pmc->new);
347 }
348 
349 #ifdef CONFIG_HAVE_MOVE_PMD
350 static bool move_normal_pmd(struct pagetable_move_control *pmc,
351 			pmd_t *old_pmd, pmd_t *new_pmd)
352 {
353 	spinlock_t *old_ptl, *new_ptl;
354 	struct vm_area_struct *vma = pmc->old;
355 	struct mm_struct *mm = vma->vm_mm;
356 	bool res = false;
357 	pmd_t pmd;
358 
359 	if (!arch_supports_page_table_move())
360 		return false;
361 	if (!uffd_supports_page_table_move(pmc))
362 		return false;
363 	/*
364 	 * The destination pmd shouldn't be established, free_pgtables()
365 	 * should have released it.
366 	 *
367 	 * However, there's a case during execve() where we use mremap
368 	 * to move the initial stack, and in that case the target area
369 	 * may overlap the source area (always moving down).
370 	 *
371 	 * If everything is PMD-aligned, that works fine, as moving
372 	 * each pmd down will clear the source pmd. But if we first
373 	 * have a few 4kB-only pages that get moved down, and then
374 	 * hit the "now the rest is PMD-aligned, let's do everything
375 	 * one pmd at a time", we will still have the old (now empty
376 	 * of any 4kB pages, but still there) PMD in the page table
377 	 * tree.
378 	 *
379 	 * Warn on it once - because we really should try to figure
380 	 * out how to do this better - but then say "I won't move
381 	 * this pmd".
382 	 *
383 	 * One alternative might be to just unmap the target pmd at
384 	 * this point, and verify that it really is empty. We'll see.
385 	 */
386 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
387 		return false;
388 
389 	/*
390 	 * We don't have to worry about the ordering of src and dst
391 	 * ptlocks because exclusive mmap_lock prevents deadlock.
392 	 */
393 	old_ptl = pmd_lock(mm, old_pmd);
394 	new_ptl = pmd_lockptr(mm, new_pmd);
395 	if (new_ptl != old_ptl)
396 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
397 
398 	pmd = *old_pmd;
399 
400 	/* Racing with collapse? */
401 	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
402 		goto out_unlock;
403 	/* Clear the pmd */
404 	pmd_clear(old_pmd);
405 	res = true;
406 
407 	VM_BUG_ON(!pmd_none(*new_pmd));
408 
409 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
410 	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
411 out_unlock:
412 	if (new_ptl != old_ptl)
413 		spin_unlock(new_ptl);
414 	spin_unlock(old_ptl);
415 
416 	return res;
417 }
418 #else
419 static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
420 		pmd_t *old_pmd, pmd_t *new_pmd)
421 {
422 	return false;
423 }
424 #endif
425 
426 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
427 static bool move_normal_pud(struct pagetable_move_control *pmc,
428 		pud_t *old_pud, pud_t *new_pud)
429 {
430 	spinlock_t *old_ptl, *new_ptl;
431 	struct vm_area_struct *vma = pmc->old;
432 	struct mm_struct *mm = vma->vm_mm;
433 	pud_t pud;
434 
435 	if (!arch_supports_page_table_move())
436 		return false;
437 	if (!uffd_supports_page_table_move(pmc))
438 		return false;
439 	/*
440 	 * The destination pud shouldn't be established, free_pgtables()
441 	 * should have released it.
442 	 */
443 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
444 		return false;
445 
446 	/*
447 	 * We don't have to worry about the ordering of src and dst
448 	 * ptlocks because exclusive mmap_lock prevents deadlock.
449 	 */
450 	old_ptl = pud_lock(mm, old_pud);
451 	new_ptl = pud_lockptr(mm, new_pud);
452 	if (new_ptl != old_ptl)
453 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
454 
455 	/* Clear the pud */
456 	pud = *old_pud;
457 	pud_clear(old_pud);
458 
459 	VM_BUG_ON(!pud_none(*new_pud));
460 
461 	pud_populate(mm, new_pud, pud_pgtable(pud));
462 	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
463 	if (new_ptl != old_ptl)
464 		spin_unlock(new_ptl);
465 	spin_unlock(old_ptl);
466 
467 	return true;
468 }
469 #else
470 static inline bool move_normal_pud(struct pagetable_move_control *pmc,
471 		pud_t *old_pud, pud_t *new_pud)
472 {
473 	return false;
474 }
475 #endif
476 
477 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
478 static bool move_huge_pud(struct pagetable_move_control *pmc,
479 		pud_t *old_pud, pud_t *new_pud)
480 {
481 	spinlock_t *old_ptl, *new_ptl;
482 	struct vm_area_struct *vma = pmc->old;
483 	struct mm_struct *mm = vma->vm_mm;
484 	pud_t pud;
485 
486 	/*
487 	 * The destination pud shouldn't be established, free_pgtables()
488 	 * should have released it.
489 	 */
490 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
491 		return false;
492 
493 	/*
494 	 * We don't have to worry about the ordering of src and dst
495 	 * ptlocks because exclusive mmap_lock prevents deadlock.
496 	 */
497 	old_ptl = pud_lock(mm, old_pud);
498 	new_ptl = pud_lockptr(mm, new_pud);
499 	if (new_ptl != old_ptl)
500 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
501 
502 	/* Clear the pud */
503 	pud = *old_pud;
504 	pud_clear(old_pud);
505 
506 	VM_BUG_ON(!pud_none(*new_pud));
507 
508 	/* Set the new pud */
509 	/* mark soft_dirty when we add pud level soft dirty support */
510 	set_pud_at(mm, pmc->new_addr, new_pud, pud);
511 	flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
512 	if (new_ptl != old_ptl)
513 		spin_unlock(new_ptl);
514 	spin_unlock(old_ptl);
515 
516 	return true;
517 }
518 #else
519 static bool move_huge_pud(struct pagetable_move_control *pmc,
520 		pud_t *old_pud, pud_t *new_pud)
521 
522 {
523 	WARN_ON_ONCE(1);
524 	return false;
525 
526 }
527 #endif
528 
529 enum pgt_entry {
530 	NORMAL_PMD,
531 	HPAGE_PMD,
532 	NORMAL_PUD,
533 	HPAGE_PUD,
534 };
535 
536 /*
537  * Returns an extent of the corresponding size for the pgt_entry specified if
538  * valid. Else returns a smaller extent bounded by the end of the source and
539  * destination pgt_entry.
540  */
541 static __always_inline unsigned long get_extent(enum pgt_entry entry,
542 						struct pagetable_move_control *pmc)
543 {
544 	unsigned long next, extent, mask, size;
545 	unsigned long old_addr = pmc->old_addr;
546 	unsigned long old_end = pmc->old_end;
547 	unsigned long new_addr = pmc->new_addr;
548 
549 	switch (entry) {
550 	case HPAGE_PMD:
551 	case NORMAL_PMD:
552 		mask = PMD_MASK;
553 		size = PMD_SIZE;
554 		break;
555 	case HPAGE_PUD:
556 	case NORMAL_PUD:
557 		mask = PUD_MASK;
558 		size = PUD_SIZE;
559 		break;
560 	default:
561 		BUILD_BUG();
562 		break;
563 	}
564 
565 	next = (old_addr + size) & mask;
566 	/* even if next overflowed, extent below will be ok */
567 	extent = next - old_addr;
568 	if (extent > old_end - old_addr)
569 		extent = old_end - old_addr;
570 	next = (new_addr + size) & mask;
571 	if (extent > next - new_addr)
572 		extent = next - new_addr;
573 	return extent;
574 }
575 
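/*
 * Worked example for get_extent() above (assuming 2MiB PMDs, i.e.
 * PMD_SIZE == 0x200000): for NORMAL_PMD with old_addr == 0x10100000 and
 * new_addr == 0x20180000,
 *
 *	next   = (0x10100000 + 0x200000) & PMD_MASK     = 0x10200000
 *	extent = 0x10200000 - 0x10100000                = 0x100000
 *	next   = (0x20180000 + 0x200000) & PMD_MASK     = 0x20200000
 *	extent = min(extent, 0x20200000 - 0x20180000)   = 0x80000
 *
 * further clamped to old_end - old_addr if the source range ends sooner.
 */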
576 /*
577  * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
578  * the PMC, or overridden in the case of normal, larger page tables.
579  */
580 static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
581 				   enum pgt_entry entry)
582 {
583 	switch (entry) {
584 	case NORMAL_PMD:
585 	case NORMAL_PUD:
586 		return true;
587 	default:
588 		return pmc->need_rmap_locks;
589 	}
590 }
591 
592 /*
593  * Attempts to speedup the move by moving entry at the level corresponding to
594  * pgt_entry. Returns true if the move was successful, else false.
595  */
596 static bool move_pgt_entry(struct pagetable_move_control *pmc,
597 			   enum pgt_entry entry, void *old_entry, void *new_entry)
598 {
599 	bool moved = false;
600 	bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
601 
602 	/* See comment in move_ptes() */
603 	if (need_rmap_locks)
604 		take_rmap_locks(pmc->old);
605 
606 	switch (entry) {
607 	case NORMAL_PMD:
608 		moved = move_normal_pmd(pmc, old_entry, new_entry);
609 		break;
610 	case NORMAL_PUD:
611 		moved = move_normal_pud(pmc, old_entry, new_entry);
612 		break;
613 	case HPAGE_PMD:
614 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
615 			move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
616 				      new_entry);
617 		break;
618 	case HPAGE_PUD:
619 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
620 			move_huge_pud(pmc, old_entry, new_entry);
621 		break;
622 
623 	default:
624 		WARN_ON_ONCE(1);
625 		break;
626 	}
627 
628 	if (need_rmap_locks)
629 		drop_rmap_locks(pmc->old);
630 
631 	return moved;
632 }
633 
634 /*
635  * A helper to check if aligning down is OK. The aligned address should fall
636  * on *no mapping*. For the stack moving down, that's a special move within
637  * the VMA that is created to span the source and destination of the move,
638  * so we make an exception for it.
639  */
640 static bool can_align_down(struct pagetable_move_control *pmc,
641 			   struct vm_area_struct *vma, unsigned long addr_to_align,
642 			   unsigned long mask)
643 {
644 	unsigned long addr_masked = addr_to_align & mask;
645 
646 	/*
647 	 * If @addr_to_align of either source or destination is not the beginning
648 	 * of the corresponding VMA, we can't align down or we will destroy part
649 	 * of the current mapping.
650 	 */
651 	if (!pmc->for_stack && vma->vm_start != addr_to_align)
652 		return false;
653 
654 	/* In the stack case we explicitly permit in-VMA alignment. */
655 	if (pmc->for_stack && addr_masked >= vma->vm_start)
656 		return true;
657 
658 	/*
659 	 * Make sure the realignment doesn't cause the address to fall on an
660 	 * existing mapping.
661 	 */
662 	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
663 }
664 
665 /*
666  * Determine if we are in fact able to realign, for efficiency, to a higher page
667  * table boundary.
668  */
669 static bool can_realign_addr(struct pagetable_move_control *pmc,
670 			     unsigned long pagetable_mask)
671 {
672 	unsigned long align_mask = ~pagetable_mask;
673 	unsigned long old_align = pmc->old_addr & align_mask;
674 	unsigned long new_align = pmc->new_addr & align_mask;
675 	unsigned long pagetable_size = align_mask + 1;
676 	unsigned long old_align_next = pagetable_size - old_align;
677 
678 	/*
679 	 * We don't want to have to go hunting for VMAs from the end of the old
680 	 * VMA to the next page table boundary, and we also want to make sure
681 	 * the operation is worthwhile.
682 	 *
683 	 * So ensure that we only perform this realignment if the end of the
684 	 * range being copied reaches or crosses the page table boundary.
685 	 *
686 	 * boundary                        boundary
687 	 *    .<- old_align ->                .
688 	 *    .              |----------------.-----------|
689 	 *    .              |          vma   .           |
690 	 *    .              |----------------.-----------|
691 	 *    .              <----------------.----------->
692 	 *    .                          len_in
693 	 *    <------------------------------->
694 	 *    .         pagetable_size        .
695 	 *    .              <---------------->
696 	 *    .                old_align_next .
697 	 */
698 	if (pmc->len_in < old_align_next)
699 		return false;
700 
701 	/* Skip if the addresses are already aligned. */
702 	if (old_align == 0)
703 		return false;
704 
705 	/* Only realign if the new and old addresses are mutually aligned. */
706 	if (old_align != new_align)
707 		return false;
708 
709 	/* Ensure realignment doesn't cause overlap with existing mappings. */
710 	if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
711 	    !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
712 		return false;
713 
714 	return true;
715 }
716 
717 /*
718  * Opportunistically realign to specified boundary for faster copy.
719  *
720  * Consider an mremap() of a VMA with page table boundaries as below, and no
721  * preceding VMAs from the lower page table boundary to the start of the VMA,
722  * with the end of the range reaching or crossing the page table boundary.
723  *
724  *   boundary                        boundary
725  *      .              |----------------.-----------|
726  *      .              |          vma   .           |
727  *      .              |----------------.-----------|
728  *      .         pmc->old_addr         .      pmc->old_end
729  *      .              <---------------------------->
730  *      .                  move these page tables
731  *
732  * If we proceed with moving page tables in this scenario, we will have a lot of
733  * work to do traversing old page tables and establishing new ones in the
734  * destination across multiple lower level page tables.
735  *
736  * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
737  * page table boundary, so we can simply copy a single page table entry for the
738  * aligned portion of the VMA instead:
739  *
740  *   boundary                        boundary
741  *      .              |----------------.-----------|
742  *      .              |          vma   .           |
743  *      .              |----------------.-----------|
744  * pmc->old_addr                        .      pmc->old_end
745  *      <------------------------------------------->
746  *      .           move these page tables
747  */
748 static void try_realign_addr(struct pagetable_move_control *pmc,
749 			     unsigned long pagetable_mask)
750 {
751 
752 	if (!can_realign_addr(pmc, pagetable_mask))
753 		return;
754 
755 	/*
756 	 * Simply align to page table boundaries. Note that we do NOT update the
757 	 * pmc->old_end value, and since the move_page_tables() operation spans
758 	 * from [old_addr, old_end) (offsetting new_addr as it is performed),
759 	 * this simply changes the start of the copy, not the end.
760 	 */
761 	pmc->old_addr &= pagetable_mask;
762 	pmc->new_addr &= pagetable_mask;
763 }
764 
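/*
 * Worked example for can_realign_addr()/try_realign_addr() above (assuming
 * PMD_SIZE == 0x200000): if old_addr == 0x10001f000 and new_addr ==
 * 0x20001f000, both are offset by 0x1f000 into their PMD, so they are
 * mutually aligned. Provided len_in reaches the next PMD boundary
 * (>= 0x200000 - 0x1f000 == 0x1e1000) and no other mapping occupies the
 * aligned-down range, both addresses are simply masked down to
 * 0x100000000 and 0x200000000 and whole PMD entries can be moved.
 */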
765 /* Is the page table move operation done? */
766 static bool pmc_done(struct pagetable_move_control *pmc)
767 {
768 	return pmc->old_addr >= pmc->old_end;
769 }
770 
771 /* Advance to the next page table, offset by extent bytes. */
772 static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
773 {
774 	pmc->old_addr += extent;
775 	pmc->new_addr += extent;
776 }
777 
778 /*
779  * Determine how many bytes in the specified input range have had their page
780  * tables moved so far.
781  */
782 static unsigned long pmc_progress(struct pagetable_move_control *pmc)
783 {
784 	unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
785 	unsigned long old_addr = pmc->old_addr;
786 
787 	/*
788 	 * Prevent negative return values when {old,new}_addr was realigned but
789 	 * we broke out of the loop in move_page_tables() for the first PMD
790 	 * itself.
791 	 */
792 	return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
793 }
794 
795 unsigned long move_page_tables(struct pagetable_move_control *pmc)
796 {
797 	unsigned long extent;
798 	struct mmu_notifier_range range;
799 	pmd_t *old_pmd, *new_pmd;
800 	pud_t *old_pud, *new_pud;
801 	struct mm_struct *mm = pmc->old->vm_mm;
802 
803 	if (!pmc->len_in)
804 		return 0;
805 
806 	if (is_vm_hugetlb_page(pmc->old))
807 		return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
808 						pmc->new_addr, pmc->len_in);
809 
810 	/*
811 	 * If possible, realign addresses to PMD boundary for faster copy.
812 	 * Only realign if the mremap copying hits a PMD boundary.
813 	 */
814 	try_realign_addr(pmc, PMD_MASK);
815 
816 	flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
817 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
818 				pmc->old_addr, pmc->old_end);
819 	mmu_notifier_invalidate_range_start(&range);
820 
821 	for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
822 		cond_resched();
823 		/*
824 		 * If extent is PUD-sized try to speed up the move by moving at the
825 		 * PUD level if possible.
826 		 */
827 		extent = get_extent(NORMAL_PUD, pmc);
828 
829 		old_pud = get_old_pud(mm, pmc->old_addr);
830 		if (!old_pud)
831 			continue;
832 		new_pud = alloc_new_pud(mm, pmc->new_addr);
833 		if (!new_pud)
834 			break;
835 		if (pud_trans_huge(*old_pud)) {
836 			if (extent == HPAGE_PUD_SIZE) {
837 				move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
838 				/* We ignore and continue on error? */
839 				continue;
840 			}
841 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
842 			if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
843 				continue;
844 		}
845 
846 		extent = get_extent(NORMAL_PMD, pmc);
847 		old_pmd = get_old_pmd(mm, pmc->old_addr);
848 		if (!old_pmd)
849 			continue;
850 		new_pmd = alloc_new_pmd(mm, pmc->new_addr);
851 		if (!new_pmd)
852 			break;
853 again:
854 		if (pmd_is_huge(*old_pmd)) {
855 			if (extent == HPAGE_PMD_SIZE &&
856 			    move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
857 				continue;
858 			split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
859 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
860 			   extent == PMD_SIZE) {
861 			/*
862 			 * If the extent is PMD-sized, try to speed the move by
863 			 * moving at the PMD level if possible.
864 			 */
865 			if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
866 				continue;
867 		}
868 		if (pmd_none(*old_pmd))
869 			continue;
870 		if (pte_alloc(pmc->new->vm_mm, new_pmd))
871 			break;
872 		if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
873 			goto again;
874 	}
875 
876 	mmu_notifier_invalidate_range_end(&range);
877 
878 	return pmc_progress(pmc);
879 }
880 
881 /* Set vrm->delta to the difference in VMA size specified by user. */
882 static void vrm_set_delta(struct vma_remap_struct *vrm)
883 {
884 	vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
885 }
886 
887 /* Determine what kind of remap this is - shrink, expand or no resize at all. */
888 static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
889 {
890 	if (vrm->delta == 0)
891 		return MREMAP_NO_RESIZE;
892 
893 	if (vrm->old_len > vrm->new_len)
894 		return MREMAP_SHRINK;
895 
896 	return MREMAP_EXPAND;
897 }
898 
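/*
 * For example, old_len == 8 pages and new_len == 4 pages gives
 * delta == 4 pages and MREMAP_SHRINK; equal lengths give MREMAP_NO_RESIZE.
 */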
899 /*
900  * When moving a VMA to vrm->new_addr, does this result in the new and old VMAs
901  * overlapping?
902  */
903 static bool vrm_overlaps(struct vma_remap_struct *vrm)
904 {
905 	unsigned long start_old = vrm->addr;
906 	unsigned long start_new = vrm->new_addr;
907 	unsigned long end_old = vrm->addr + vrm->old_len;
908 	unsigned long end_new = vrm->new_addr + vrm->new_len;
909 
910 	/*
911 	 * start_old    end_old
912 	 *     |-----------|
913 	 *     |           |
914 	 *     |-----------|
915 	 *             |-------------|
916 	 *             |             |
917 	 *             |-------------|
918 	 *         start_new      end_new
919 	 */
920 	if (end_old > start_new && end_new > start_old)
921 		return true;
922 
923 	return false;
924 }
925 
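/*
 * Worked example: addr == 0x100000, old_len == 0x40000 (end_old ==
 * 0x140000), new_addr == 0x120000, new_len == 0x40000 (end_new ==
 * 0x160000). Then end_old > start_new and end_new > start_old, so the
 * ranges overlap and vrm_overlaps() returns true.
 */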
926 /*
927  * Will a new address definitely be assigned? This either if the user specifies
928  * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will
929  * always detemrine a target address.
930  */
931 static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
932 {
933 	return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
934 }
935 
936 /*
937  * Find an unmapped area for the requested vrm->new_addr.
938  *
939  * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
940  * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
941  * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
942  *
943  * Returns 0 on success (with vrm->new_addr updated), or an error code upon
944  * failure.
945  */
946 static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
947 {
948 	struct vm_area_struct *vma = vrm->vma;
949 	unsigned long map_flags = 0;
950 	/* Page Offset _into_ the VMA. */
951 	pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
952 	pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
953 	unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
954 	unsigned long res;
955 
956 	if (vrm->flags & MREMAP_FIXED)
957 		map_flags |= MAP_FIXED;
958 	if (vma->vm_flags & VM_MAYSHARE)
959 		map_flags |= MAP_SHARED;
960 
961 	res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
962 				map_flags);
963 	if (IS_ERR_VALUE(res))
964 		return res;
965 
966 	vrm->new_addr = res;
967 	return 0;
968 }
969 
970 /*
971  * Keep track of pages which have been added to the memory mapping. If the VMA
972  * is accounted, also check to see if there is sufficient memory.
973  *
974  * Returns true on success, false if insufficient memory to charge.
975  */
976 static bool vrm_calc_charge(struct vma_remap_struct *vrm)
977 {
978 	unsigned long charged;
979 
980 	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
981 		return true;
982 
983 	/*
984 	 * If we don't unmap the old mapping, then we account the entirety of
985 	 * the length of the new one. Otherwise it's just the delta in size.
986 	 */
987 	if (vrm->flags & MREMAP_DONTUNMAP)
988 		charged = vrm->new_len >> PAGE_SHIFT;
989 	else
990 		charged = vrm->delta >> PAGE_SHIFT;
991 
992 
993 	/* This accounts 'charged' pages of memory. */
994 	if (security_vm_enough_memory_mm(current->mm, charged))
995 		return false;
996 
997 	vrm->charged = charged;
998 	return true;
999 }
1000 
1001 /*
1002  * An error has occurred, so we will not be using vrm->charged memory. Unaccount
1003  * this memory if the VMA is accounted.
1004  */
1005 static void vrm_uncharge(struct vma_remap_struct *vrm)
1006 {
1007 	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
1008 		return;
1009 
1010 	vm_unacct_memory(vrm->charged);
1011 	vrm->charged = 0;
1012 }
1013 
1014 /*
1015  * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
1016  * account for 'bytes' memory used, and if locked, indicate this in the VRM so
1017  * we can handle this correctly later.
1018  */
1019 static void vrm_stat_account(struct vma_remap_struct *vrm,
1020 			     unsigned long bytes)
1021 {
1022 	unsigned long pages = bytes >> PAGE_SHIFT;
1023 	struct mm_struct *mm = current->mm;
1024 	struct vm_area_struct *vma = vrm->vma;
1025 
1026 	vm_stat_account(mm, vma->vm_flags, pages);
1027 	if (vma->vm_flags & VM_LOCKED)
1028 		mm->locked_vm += pages;
1029 }
1030 
1031 /*
1032  * Perform checks before attempting to write a VMA prior to it being
1033  * moved.
1034  */
1035 static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
1036 {
1037 	unsigned long err = 0;
1038 	struct vm_area_struct *vma = vrm->vma;
1039 	unsigned long old_addr = vrm->addr;
1040 	unsigned long old_len = vrm->old_len;
1041 	vm_flags_t dummy = vma->vm_flags;
1042 
1043 	/*
1044 	 * We'd prefer to avoid failure later on in do_munmap:
1045 	 * which may split one vma into three before unmapping.
1046 	 */
1047 	if (current->mm->map_count >= sysctl_max_map_count - 3)
1048 		return -ENOMEM;
1049 
1050 	if (vma->vm_ops && vma->vm_ops->may_split) {
1051 		if (vma->vm_start != old_addr)
1052 			err = vma->vm_ops->may_split(vma, old_addr);
1053 		if (!err && vma->vm_end != old_addr + old_len)
1054 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
1055 		if (err)
1056 			return err;
1057 	}
1058 
1059 	/*
1060 	 * Advise KSM to break any KSM pages in the area to be moved:
1061 	 * it would be confusing if they were to turn up at the new
1062 	 * location, where they happen to coincide with different KSM
1063 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
1064 	 * so KSM can come around to merge on vma and new_vma afterwards.
1065 	 */
1066 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
1067 			  MADV_UNMERGEABLE, &dummy);
1068 	if (err)
1069 		return err;
1070 
1071 	return 0;
1072 }
1073 
1074 /*
1075  * Unmap source VMA for VMA move, turning it from a copy to a move, being
1076  * careful to ensure we do not underflow memory accounting while doing so if
1077  * this is an accountable move.
1078  *
1079  * This is best effort, if we fail to unmap then we simply try to correct
1080  * accounting and exit.
1081  */
1082 static void unmap_source_vma(struct vma_remap_struct *vrm)
1083 {
1084 	struct mm_struct *mm = current->mm;
1085 	unsigned long addr = vrm->addr;
1086 	unsigned long len = vrm->old_len;
1087 	struct vm_area_struct *vma = vrm->vma;
1088 	VMA_ITERATOR(vmi, mm, addr);
1089 	int err;
1090 	unsigned long vm_start;
1091 	unsigned long vm_end;
1092 	/*
1093 	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
1094 	 * function implies that we unmap the original VMA, which seems
1095 	 * contradictory.
1096 	 *
1097 	 * However, this occurs when this operation was attempted and an error
1098 	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
1099 	 * we actually _do_ want it to be unaccounted.
1100 	 */
1101 	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
1102 		!(vrm->flags & MREMAP_DONTUNMAP);
1103 
1104 	/*
1105 	 * So we perform a trick here to prevent incorrect accounting. Any merge
1106 	 * or new VMA allocation performed in copy_vma() does not adjust
1107 	 * accounting, it is expected that callers handle this.
1108 	 *
1109 	 * And indeed we already have, accounting appropriately for both cases
1110 	 * in vrm_calc_charge().
1111 	 *
1112 	 * However, when we unmap the existing VMA (to effect the move), this
1113 	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
1114 	 * removed pages.
1115 	 *
1116 	 * To avoid this we temporarily clear this flag, reinstating on any
1117 	 * portions of the original VMA that remain.
1118 	 */
1119 	if (accountable_move) {
1120 		vm_flags_clear(vma, VM_ACCOUNT);
1121 		/* We are about to split vma, so store the start/end. */
1122 		vm_start = vma->vm_start;
1123 		vm_end = vma->vm_end;
1124 	}
1125 
1126 	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
1127 	vrm->vma = NULL; /* Invalidated. */
1128 	vrm->vmi_needs_invalidate = true;
1129 	if (err) {
1130 		/* OOM: unable to split vma, just get accounts right */
1131 		vm_acct_memory(len >> PAGE_SHIFT);
1132 		return;
1133 	}
1134 
1135 	/*
1136 	 * If we mremap() from a VMA like this:
1137 	 *
1138 	 *    addr  end
1139 	 *     |     |
1140 	 *     v     v
1141 	 * |-------------|
1142 	 * |             |
1143 	 * |-------------|
1144 	 *
1145 	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
1146 	 * we'll end up with:
1147 	 *
1148 	 *    addr  end
1149 	 *     |     |
1150 	 *     v     v
1151 	 * |---|     |---|
1152 	 * | A |     | B |
1153 	 * |---|     |---|
1154 	 *
1155 	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
1156 	 * a subsequent or lone vma_next() will give us B.
1157 	 *
1158 	 * do_vmi_munmap() will have restored the VMI back to addr.
1159 	 */
1160 	if (accountable_move) {
1161 		unsigned long end = addr + len;
1162 
1163 		if (vm_start < addr) {
1164 			struct vm_area_struct *prev = vma_prev(&vmi);
1165 
1166 			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
1167 		}
1168 
1169 		if (vm_end > end) {
1170 			struct vm_area_struct *next = vma_next(&vmi);
1171 
1172 			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
1173 		}
1174 	}
1175 }
1176 
1177 /*
1178  * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
1179  * process. Additionally handle an error occurring on moving of page tables,
1180  * where we reset vrm state to cause unmapping of the new VMA.
1181  *
1182  * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
1183  * error code.
1184  */
1185 static int copy_vma_and_data(struct vma_remap_struct *vrm,
1186 			     struct vm_area_struct **new_vma_ptr)
1187 {
1188 	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
1189 	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
1190 	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
1191 	unsigned long moved_len;
1192 	struct vm_area_struct *vma = vrm->vma;
1193 	struct vm_area_struct *new_vma;
1194 	int err = 0;
1195 	PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
1196 
1197 	new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
1198 			   &pmc.need_rmap_locks);
1199 	if (!new_vma) {
1200 		vrm_uncharge(vrm);
1201 		*new_vma_ptr = NULL;
1202 		return -ENOMEM;
1203 	}
1204 	/* By merging, we may have invalidated any iterator in use. */
1205 	if (vma != vrm->vma)
1206 		vrm->vmi_needs_invalidate = true;
1207 
1208 	vrm->vma = vma;
1209 	pmc.old = vma;
1210 	pmc.new = new_vma;
1211 
1212 	moved_len = move_page_tables(&pmc);
1213 	if (moved_len < vrm->old_len)
1214 		err = -ENOMEM;
1215 	else if (vma->vm_ops && vma->vm_ops->mremap)
1216 		err = vma->vm_ops->mremap(new_vma);
1217 
1218 	if (unlikely(err)) {
1219 		PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
1220 			       vrm->addr, moved_len);
1221 
1222 		/*
1223 		 * On error, move entries back from new area to old,
1224 		 * which will succeed since the page tables are still there,
1225 		 * and then proceed to unmap new area instead of old.
1226 		 */
1227 		pmc_revert.need_rmap_locks = true;
1228 		move_page_tables(&pmc_revert);
1229 
1230 		vrm->vma = new_vma;
1231 		vrm->old_len = vrm->new_len;
1232 		vrm->addr = vrm->new_addr;
1233 	} else {
1234 		mremap_userfaultfd_prep(new_vma, vrm->uf);
1235 	}
1236 
1237 	fixup_hugetlb_reservations(vma);
1238 
1239 	*new_vma_ptr = new_vma;
1240 	return err;
1241 }
1242 
1243 /*
1244  * Perform final tasks for MREMAP_DONTUNMAP operation, clearing mlock() flag on
1245  * remaining VMA by convention (it cannot be mlock()'d any longer, as pages in
1246  * range are no longer mapped), and removing anon_vma_chain links from it if the
1247  * entire VMA was copied over.
1248  */
1249 static void dontunmap_complete(struct vma_remap_struct *vrm,
1250 			       struct vm_area_struct *new_vma)
1251 {
1252 	unsigned long start = vrm->addr;
1253 	unsigned long end = vrm->addr + vrm->old_len;
1254 	unsigned long old_start = vrm->vma->vm_start;
1255 	unsigned long old_end = vrm->vma->vm_end;
1256 
1257 	/* We always clear VM_LOCKED[ONFAULT] on the old VMA. */
1258 	vm_flags_clear(vrm->vma, VM_LOCKED_MASK);
1259 
1260 	/*
1261 	 * anon_vma links of the old vma are no longer needed after its page
1262 	 * table has been moved.
1263 	 */
1264 	if (new_vma != vrm->vma && start == old_start && end == old_end)
1265 		unlink_anon_vmas(vrm->vma);
1266 
1267 	/* Because we won't unmap we don't need to touch locked_vm. */
1268 }
1269 
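/*
 * For illustration only (not part of this file): the flag combination
 * handled above corresponds to a userspace call such as
 *
 *	void *new = mremap(old, len, len,
 *			   MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
 *
 * where the final argument is only a placement hint. The pages move to the
 * returned address while [old, old + len) stays mapped but empty; old_len
 * must equal new_len for MREMAP_DONTUNMAP.
 */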
1270 static unsigned long move_vma(struct vma_remap_struct *vrm)
1271 {
1272 	struct mm_struct *mm = current->mm;
1273 	struct vm_area_struct *new_vma;
1274 	unsigned long hiwater_vm;
1275 	int err;
1276 
1277 	err = prep_move_vma(vrm);
1278 	if (err)
1279 		return err;
1280 
1281 	/*
1282 	 * If accounted, determine the number of bytes the operation will
1283 	 * charge.
1284 	 */
1285 	if (!vrm_calc_charge(vrm))
1286 		return -ENOMEM;
1287 
1288 	/* We don't want racing faults. */
1289 	vma_start_write(vrm->vma);
1290 
1291 	/* Perform copy step. */
1292 	err = copy_vma_and_data(vrm, &new_vma);
1293 	/*
1294 	 * If we established the copied-to VMA, we attempt to recover from the
1295 	 * error by setting the destination VMA to the source VMA and unmapping
1296 	 * it below.
1297 	 */
1298 	if (err && !new_vma)
1299 		return err;
1300 
1301 	/*
1302 	 * If we failed to move page tables we still do total_vm increment
1303 	 * since do_munmap() will decrement it by old_len == new_len.
1304 	 *
1305 	 * Since total_vm is about to be raised artificially high for a
1306 	 * moment, we need to restore high watermark afterwards: if stats
1307 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
1308 	 * If this were a serious issue, we'd add a flag to do_munmap().
1309 	 */
1310 	hiwater_vm = mm->hiwater_vm;
1311 
1312 	vrm_stat_account(vrm, vrm->new_len);
1313 	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
1314 		dontunmap_complete(vrm, new_vma);
1315 	else
1316 		unmap_source_vma(vrm);
1317 
1318 	mm->hiwater_vm = hiwater_vm;
1319 
1320 	return err ? (unsigned long)err : vrm->new_addr;
1321 }
1322 
1323 /*
1324  * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
1325  * execute this, optionally dropping the mmap lock when we do so.
1326  *
1327  * In both cases this invalidates the VMA, however if we don't drop the lock,
1328  * then load the correct VMA into vrm->vma afterwards.
1329  */
1330 static unsigned long shrink_vma(struct vma_remap_struct *vrm,
1331 				bool drop_lock)
1332 {
1333 	struct mm_struct *mm = current->mm;
1334 	unsigned long unmap_start = vrm->addr + vrm->new_len;
1335 	unsigned long unmap_bytes = vrm->delta;
1336 	unsigned long res;
1337 	VMA_ITERATOR(vmi, mm, unmap_start);
1338 
1339 	VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);
1340 
1341 	res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
1342 			    vrm->uf_unmap, drop_lock);
1343 	vrm->vma = NULL; /* Invalidated. */
1344 	if (res)
1345 		return res;
1346 
1347 	/*
1348 	 * If we've not dropped the lock, then we should reload the VMA to
1349 	 * replace the invalidated VMA with the one that may have now been
1350 	 * split.
1351 	 */
1352 	if (drop_lock) {
1353 		vrm->mmap_locked = false;
1354 	} else {
1355 		vrm->vma = vma_lookup(mm, vrm->addr);
1356 		if (!vrm->vma)
1357 			return -EFAULT;
1358 	}
1359 
1360 	return 0;
1361 }
1362 
1363 /*
1364  * mremap_to() - remap a vma to a new location.
1365  * Returns: The new address of the vma or an error.
1366  */
1367 static unsigned long mremap_to(struct vma_remap_struct *vrm)
1368 {
1369 	struct mm_struct *mm = current->mm;
1370 	unsigned long err;
1371 
1372 	if (vrm->flags & MREMAP_FIXED) {
1373 		/*
1374 		 * The VMA is being moved to the destination address, so we
1375 		 * munmap the destination first. do_munmap() will check
1376 		 * whether the destination is sealed.
1377 		 */
1378 		err = do_munmap(mm, vrm->new_addr, vrm->new_len,
1379 				vrm->uf_unmap_early);
1380 		vrm->vma = NULL; /* Invalidated. */
1381 		vrm->vmi_needs_invalidate = true;
1382 		if (err)
1383 			return err;
1384 
1385 		/*
1386 		 * If we remap a portion of a VMA elsewhere in the same VMA,
1387 		 * this can invalidate the old VMA. Reset.
1388 		 */
1389 		vrm->vma = vma_lookup(mm, vrm->addr);
1390 		if (!vrm->vma)
1391 			return -EFAULT;
1392 	}
1393 
1394 	if (vrm->remap_type == MREMAP_SHRINK) {
1395 		err = shrink_vma(vrm, /* drop_lock= */false);
1396 		if (err)
1397 			return err;
1398 
1399 		/* Set up for the move now shrink has been executed. */
1400 		vrm->old_len = vrm->new_len;
1401 	}
1402 
1403 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
1404 	if (vrm->flags & MREMAP_DONTUNMAP) {
1405 		vm_flags_t vm_flags = vrm->vma->vm_flags;
1406 		unsigned long pages = vrm->old_len >> PAGE_SHIFT;
1407 
1408 		if (!may_expand_vm(mm, vm_flags, pages))
1409 			return -ENOMEM;
1410 	}
1411 
1412 	err = vrm_set_new_addr(vrm);
1413 	if (err)
1414 		return err;
1415 
1416 	return move_vma(vrm);
1417 }
1418 
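/*
 * For illustration only (not part of this file): mremap_to() above is the
 * path taken by a fixed-destination call such as
 *
 *	void *new = mremap(old, old_len, new_len,
 *			   MREMAP_MAYMOVE | MREMAP_FIXED, target);
 *
 * Any existing mapping at [target, target + new_len) is unmapped first,
 * much as MAP_FIXED behaves for mmap().
 */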
1419 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
1420 {
1421 	unsigned long end = vma->vm_end + delta;
1422 
1423 	if (end < vma->vm_end) /* overflow */
1424 		return 0;
1425 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
1426 		return 0;
1427 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
1428 			      0, MAP_FIXED) & ~PAGE_MASK)
1429 		return 0;
1430 	return 1;
1431 }
1432 
1433 /* Determine whether we are actually able to execute an in-place expansion. */
1434 static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
1435 {
1436 	/* Number of bytes from vrm->addr to end of VMA. */
1437 	unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;
1438 
1439 	/* If end of range aligns to end of VMA, we can just expand in-place. */
1440 	if (suffix_bytes != vrm->old_len)
1441 		return false;
1442 
1443 	/* Check whether this is feasible. */
1444 	if (!vma_expandable(vrm->vma, vrm->delta))
1445 		return false;
1446 
1447 	return true;
1448 }
1449 
1450 /*
1451  * We know we can expand the VMA in-place by delta pages, so do so.
1452  *
1453  * If we discover the VMA is locked, update mm_struct statistics accordingly and
1454  * indicate so to the caller.
1455  */
1456 static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
1457 {
1458 	struct mm_struct *mm = current->mm;
1459 	struct vm_area_struct *vma = vrm->vma;
1460 	VMA_ITERATOR(vmi, mm, vma->vm_end);
1461 
1462 	if (!vrm_calc_charge(vrm))
1463 		return -ENOMEM;
1464 
1465 	/*
1466 	 * Function vma_merge_extend() is called on the
1467 	 * extension we are adding to the already existing vma,
1468 	 * vma_merge_extend() will merge this extension with the
1469 	 * already existing vma (expand operation itself) and
1470 	 * possibly also with the next vma if it becomes
1471 	 * adjacent to the expanded vma and otherwise
1472 	 * compatible.
1473 	 */
1474 	vma = vma_merge_extend(&vmi, vma, vrm->delta);
1475 	if (!vma) {
1476 		vrm_uncharge(vrm);
1477 		return -ENOMEM;
1478 	}
1479 	vrm->vma = vma;
1480 
1481 	vrm_stat_account(vrm, vrm->delta);
1482 
1483 	return 0;
1484 }
1485 
1486 static bool align_hugetlb(struct vma_remap_struct *vrm)
1487 {
1488 	struct hstate *h __maybe_unused = hstate_vma(vrm->vma);
1489 
1490 	vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
1491 	vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));
1492 
1493 	/* addrs must be huge page aligned */
1494 	if (vrm->addr & ~huge_page_mask(h))
1495 		return false;
1496 	if (vrm->new_addr & ~huge_page_mask(h))
1497 		return false;
1498 
1499 	/*
1500 	 * Don't allow remap expansion, because the underlying hugetlb
1501 	 * reservation is not yet capable of handling split reservations.
1502 	 */
1503 	if (vrm->new_len > vrm->old_len)
1504 		return false;
1505 
1506 	return true;
1507 }
1508 
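/*
 * Worked example for align_hugetlb() above (assuming 2MiB hugetlb pages):
 * old_len == 3MiB is rounded up to 4MiB, and addr/new_addr must themselves
 * be 2MiB aligned; a request that would grow the mapping (new_len >
 * old_len after rounding) is rejected.
 */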
1509 /*
1510  * We are mremap()'ing without specifying a fixed address to move to, but are
1511  * requesting that the VMA's size be increased.
1512  *
1513  * Try to do so in-place, if this fails, then move the VMA to a new location to
1514  * action the change.
1515  */
1516 static unsigned long expand_vma(struct vma_remap_struct *vrm)
1517 {
1518 	unsigned long err;
1519 
1520 	/*
1521 	 * [addr, old_len) spans precisely to the end of the VMA, so try to
1522 	 * expand it in-place.
1523 	 */
1524 	if (vrm_can_expand_in_place(vrm)) {
1525 		err = expand_vma_in_place(vrm);
1526 		if (err)
1527 			return err;
1528 
1529 		/* OK we're done! */
1530 		return vrm->addr;
1531 	}
1532 
1533 	/*
1534 	 * We weren't able to just expand or shrink the area,
1535 	 * we need to create a new one and move it.
1536 	 */
1537 
1538 	/* We're not allowed to move the VMA, so error out. */
1539 	if (!(vrm->flags & MREMAP_MAYMOVE))
1540 		return -ENOMEM;
1541 
1542 	/* Find a new location to move the VMA to. */
1543 	err = vrm_set_new_addr(vrm);
1544 	if (err)
1545 		return err;
1546 
1547 	return move_vma(vrm);
1548 }
1549 
1550 /*
1551  * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
1552  * first available address to perform the operation.
1553  */
1554 static unsigned long mremap_at(struct vma_remap_struct *vrm)
1555 {
1556 	unsigned long res;
1557 
1558 	switch (vrm->remap_type) {
1559 	case MREMAP_INVALID:
1560 		break;
1561 	case MREMAP_NO_RESIZE:
1562 		/* NO-OP CASE - resizing to the same size. */
1563 		return vrm->addr;
1564 	case MREMAP_SHRINK:
1565 		/*
1566 		 * SHRINK CASE. Can always be done in-place.
1567 		 *
1568 		 * Simply unmap the shrunken portion of the VMA. This does all
1569 		 * the needed commit accounting, and we indicate that the mmap
1570 		 * lock should be dropped.
1571 		 */
1572 		res = shrink_vma(vrm, /* drop_lock= */true);
1573 		if (res)
1574 			return res;
1575 
1576 		return vrm->addr;
1577 	case MREMAP_EXPAND:
1578 		return expand_vma(vrm);
1579 	}
1580 
1581 	/* Should not be possible. */
1582 	WARN_ON_ONCE(1);
1583 	return -EINVAL;
1584 }
1585 
1586 /*
1587  * Will this operation result in the VMA being expanded or moved and thus need
1588  * to map a new portion of virtual address space?
1589  */
1590 static bool vrm_will_map_new(struct vma_remap_struct *vrm)
1591 {
1592 	if (vrm->remap_type == MREMAP_EXPAND)
1593 		return true;
1594 
1595 	if (vrm_implies_new_addr(vrm))
1596 		return true;
1597 
1598 	return false;
1599 }
1600 
1601 /* Does this remap ONLY move mappings? */
1602 static bool vrm_move_only(struct vma_remap_struct *vrm)
1603 {
1604 	if (!(vrm->flags & MREMAP_FIXED))
1605 		return false;
1606 
1607 	if (vrm->old_len != vrm->new_len)
1608 		return false;
1609 
1610 	return true;
1611 }
1612 
1613 static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
1614 {
1615 	struct mm_struct *mm = current->mm;
1616 
1617 	/* Regardless of success/failure, we always notify of any unmaps. */
1618 	userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
1619 	if (failed)
1620 		mremap_userfaultfd_fail(vrm->uf);
1621 	else
1622 		mremap_userfaultfd_complete(vrm->uf, vrm->addr,
1623 			vrm->new_addr, vrm->old_len);
1624 	userfaultfd_unmap_complete(mm, vrm->uf_unmap);
1625 }
1626 
1627 static bool vma_multi_allowed(struct vm_area_struct *vma)
1628 {
1629 	struct file *file = vma->vm_file;
1630 
1631 	/*
1632 	 * We can't support moving multiple uffd VMAs as notify requires
1633 	 * mmap lock to be dropped.
1634 	 */
1635 	if (userfaultfd_armed(vma))
1636 		return false;
1637 
1638 	/*
1639 	 * Custom get unmapped area might result in MREMAP_FIXED not
1640 	 * being obeyed.
1641 	 */
1642 	if (!file || !file->f_op->get_unmapped_area)
1643 		return true;
1644 	/* Known good. */
1645 	if (vma_is_shmem(vma))
1646 		return true;
1647 	if (is_vm_hugetlb_page(vma))
1648 		return true;
1649 	if (file->f_op->get_unmapped_area == thp_get_unmapped_area)
1650 		return true;
1651 
1652 	return false;
1653 }
1654 
1655 static int check_prep_vma(struct vma_remap_struct *vrm)
1656 {
1657 	struct vm_area_struct *vma = vrm->vma;
1658 	struct mm_struct *mm = current->mm;
1659 	unsigned long addr = vrm->addr;
1660 	unsigned long old_len, new_len, pgoff;
1661 
1662 	if (!vma)
1663 		return -EFAULT;
1664 
1665 	/* If mseal()'d, mremap() is prohibited. */
1666 	if (vma_is_sealed(vma))
1667 		return -EPERM;
1668 
1669 	/* Align to hugetlb page size, if required. */
1670 	if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm))
1671 		return -EINVAL;
1672 
1673 	vrm_set_delta(vrm);
1674 	vrm->remap_type = vrm_remap_type(vrm);
1675 	/* For convenience, we set new_addr even if VMA won't move. */
1676 	if (!vrm_implies_new_addr(vrm))
1677 		vrm->new_addr = addr;
1678 
1679 	/* Below only meaningful if we expand or move a VMA. */
1680 	if (!vrm_will_map_new(vrm))
1681 		return 0;
1682 
1683 	old_len = vrm->old_len;
1684 	new_len = vrm->new_len;
1685 
1686 	/*
1687 	 * !old_len is a special case where an attempt is made to 'duplicate'
1688 	 * a mapping.  This makes no sense for private mappings as it will
1689 	 * instead create a fresh/new mapping unrelated to the original.  This
1690 	 * is contrary to the basic idea of mremap which creates new mappings
1691 	 * based on the original.  There are no known use cases for this
1692 	 * behavior.  As a result, fail such attempts.
1693 	 */
1694 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
1695 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n",
1696 			     current->comm, current->pid);
1697 		return -EINVAL;
1698 	}
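
	/*
	 * Illustrative userspace sketch (not part of this file): the
	 * old_len == 0 'duplicate' form that remains supported acts on a
	 * shared mapping, e.g.:
	 *
	 *	dup = mremap(old, 0, len, MREMAP_MAYMOVE);
	 *
	 * which leaves 'old' intact and maps the same shared pages again at
	 * 'dup'. The equivalent call on a MAP_PRIVATE mapping is rejected
	 * above with -EINVAL.
	 */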
1699 
1700 	if ((vrm->flags & MREMAP_DONTUNMAP) &&
1701 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
1702 		return -EINVAL;
1703 
1704 	/*
1705 	 * We permit crossing of boundaries for the range being unmapped due to
1706 	 * a shrink.
1707 	 */
1708 	if (vrm->remap_type == MREMAP_SHRINK)
1709 		old_len = new_len;
1710 
1711 	/*
1712 	 * We can't remap across the end of VMAs, as another VMA may be
1713 	 * adjacent:
1714 	 *
1715 	 *       addr   vma->vm_end
1716 	 *  |-----.----------|
1717 	 *  |     .          |
1718 	 *  |-----.----------|
1719 	 *        .<--------->xxx>
1720 	 *            old_len
1721 	 *
1722 	 * We also require that vma->vm_start <= addr < vma->vm_end.
1723 	 */
1724 	if (old_len > vma->vm_end - addr)
1725 		return -EFAULT;
1726 
1727 	if (new_len == old_len)
1728 		return 0;
1729 
1730 	/* We are expanding and the VMA is mlock()'d so we need to populate. */
1731 	if (vma->vm_flags & VM_LOCKED)
1732 		vrm->populate_expand = true;
1733 
1734 	/* Need to be careful about a growing mapping */
1735 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
1736 	pgoff += vma->vm_pgoff;
1737 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
1738 		return -EINVAL;
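
	/*
	 * For illustration (hypothetical values, 4 KiB pages assumed): if the
	 * combined page offset were already near ULONG_MAX, e.g.
	 * pgoff == ULONG_MAX - 1, an expansion spanning two pages would wrap:
	 *
	 *	(ULONG_MAX - 1) + 2 == 0 < pgoff
	 *
	 * so the check above rejects it rather than letting the file offset
	 * silently overflow.
	 */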
1739 
1740 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
1741 		return -EFAULT;
1742 
1743 	if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
1744 		return -EAGAIN;
1745 
1746 	if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
1747 		return -ENOMEM;
1748 
1749 	return 0;
1750 }
1751 
1752 /*
1753  * Are the parameters passed to mremap() valid? If so, return 0, otherwise
1754  * return an error.
1755  */
1756 static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
1757 
1758 {
1759 	unsigned long addr = vrm->addr;
1760 	unsigned long flags = vrm->flags;
1761 
1762 	/* Ensure no unexpected flag values. */
1763 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
1764 		return -EINVAL;
1765 
1766 	/* Start address must be page-aligned. */
1767 	if (offset_in_page(addr))
1768 		return -EINVAL;
1769 
1770 	/*
1771 	 * We allow a zero old-len as a special case
1772 	 * for DOS-emu "duplicate shm area" thing. But
1773 	 * a zero new-len is nonsensical.
1774 	 */
1775 	if (!vrm->new_len)
1776 		return -EINVAL;
1777 
1778 	/* Is the new length silly? */
1779 	if (vrm->new_len > TASK_SIZE)
1780 		return -EINVAL;
1781 
1782 	/* Remainder of checks are for cases with specific new_addr. */
1783 	if (!vrm_implies_new_addr(vrm))
1784 		return 0;
1785 
1786 	/* Is the new address silly? */
1787 	if (vrm->new_addr > TASK_SIZE - vrm->new_len)
1788 		return -EINVAL;
1789 
1790 	/* The new address must be page-aligned. */
1791 	if (offset_in_page(vrm->new_addr))
1792 		return -EINVAL;
1793 
1794 	/* A fixed address implies a move. */
1795 	if (!(flags & MREMAP_MAYMOVE))
1796 		return -EINVAL;
1797 
1798 	/* MREMAP_DONTUNMAP does not allow resizing in the process. */
1799 	if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
1800 		return -EINVAL;
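
	/*
	 * Illustrative userspace sketch (not part of this file):
	 * MREMAP_DONTUNMAP moves the pages but leaves the old range mapped
	 * (and empty), e.g. on a private anonymous mapping:
	 *
	 *	new = mremap(old, len, len,
	 *		     MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	 *
	 * hence the requirement that old_len == new_len.
	 */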
1801 
1802 	/* Target VMA must not overlap source VMA. */
1803 	if (vrm_overlaps(vrm))
1804 		return -EINVAL;
1805 
1806 	/*
1807 	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
1808 	 * it will bail out at the very beginning.
1809 	 * That is a problem if we have already unmapped the regions here
1810 	 * (new_addr and old_addr), because userspace will not know the
1811 	 * state of the VMAs after it gets -ENOMEM.
1812 	 * So, to avoid such a scenario, we pre-compute whether the whole
1813 	 * operation has a high chance of succeeding map-wise.
1814 	 * The worst case is when both VMAs (new_addr and old_addr) get
1815 	 * split in 3 before being unmapped.
1816 	 * That means 2 more maps (1 for each) on top of the ones we already hold.
1817 	 * Check whether the current map count plus 2 still leaves us 4 maps
1818 	 * below the threshold, otherwise return -ENOMEM here to be safe.
1819 	 */
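	/*
	 * For example, with the default sysctl_max_map_count (65530) the check
	 * below starts refusing requests once map_count reaches 65525
	 * (65525 + 2 >= 65530 - 3); below that, even the worst-case two extra
	 * splits still leave the headroom move_vma() requires.
	 */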
1820 	if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
1821 		return -ENOMEM;
1822 
1823 	return 0;
1824 }
1825 
1826 static unsigned long remap_move(struct vma_remap_struct *vrm)
1827 {
1828 	struct vm_area_struct *vma;
1829 	unsigned long start = vrm->addr;
1830 	unsigned long end = vrm->addr + vrm->old_len;
1831 	unsigned long new_addr = vrm->new_addr;
1832 	unsigned long target_addr = new_addr;
1833 	unsigned long res = -EFAULT;
1834 	unsigned long last_end;
1835 	bool seen_vma = false;
1836 
1837 	VMA_ITERATOR(vmi, current->mm, start);
1838 
1839 	/*
1840 	 * When moving VMAs we allow for batched moves across multiple VMAs,
1841 	 * with all VMAs in the input range [addr, addr + old_len) being moved
1842 	 * (and split as necessary).
1843 	 */
1844 	for_each_vma_range(vmi, vma, end) {
1845 		/* Account for start, end not aligned with VMA start, end. */
1846 		unsigned long addr = max(vma->vm_start, start);
1847 		unsigned long len = min(end, vma->vm_end) - addr;
1848 		unsigned long offset, res_vma;
1849 		bool multi_allowed;
1850 
1851 		/* No gap permitted at the start of the range. */
1852 		if (!seen_vma && start < vma->vm_start)
1853 			return -EFAULT;
1854 
1855 		/*
1856 		 * To sensibly move multiple VMAs, accounting for the fact that
1857 		 * get_unmapped_area() may align even MAP_FIXED moves, we simply
1858 		 * attempt to move such that the gaps between source VMAs remain
1859 		 * consistent in destination VMAs, e.g.:
1860 		 *
1861 		 *           X        Y                       X        Y
1862 		 *         <--->     <->                    <--->     <->
1863 		 * |-------|   |-----| |-----|      |-------|   |-----| |-----|
1864 		 * |   A   |   |  B  | |  C  | ---> |   A'  |   |  B' | |  C' |
1865 		 * |-------|   |-----| |-----|      |-------|   |-----| |-----|
1866 		 *                               new_addr
1867 		 *
1868 		 * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
1869 		 */
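		/*
		 * For example, if A spans [addr, addr + 0x1000), B spans
		 * [addr + 0x3000, addr + 0x4000) and C spans
		 * [addr + 0x4800, addr + 0x5000) (so X = 0x2000, Y = 0x800),
		 * then B' is placed at new_addr + 0x3000 and C' at
		 * new_addr + 0x4800, reproducing the source gaps at the
		 * destination.
		 */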
1870 		offset = seen_vma ? vma->vm_start - last_end : 0;
1871 		last_end = vma->vm_end;
1872 
1873 		vrm->vma = vma;
1874 		vrm->addr = addr;
1875 		vrm->new_addr = target_addr + offset;
1876 		vrm->old_len = vrm->new_len = len;
1877 
1878 		multi_allowed = vma_multi_allowed(vma);
1879 		if (!multi_allowed) {
1880 			/* This is not the first VMA, abort immediately. */
1881 			if (seen_vma)
1882 				return -EFAULT;
1883 			/* This is the first, but there are more, abort. */
1884 			if (vma->vm_end < end)
1885 				return -EFAULT;
1886 		}
1887 
1888 		res_vma = check_prep_vma(vrm);
1889 		if (!res_vma)
1890 			res_vma = mremap_to(vrm);
1891 		if (IS_ERR_VALUE(res_vma))
1892 			return res_vma;
1893 
1894 		if (!seen_vma) {
1895 			VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr);
1896 			res = res_vma;
1897 		}
1898 
1899 		/* mmap lock is only dropped on shrink. */
1900 		VM_WARN_ON_ONCE(!vrm->mmap_locked);
1901 		/* This is a move, no expand should occur. */
1902 		VM_WARN_ON_ONCE(vrm->populate_expand);
1903 
1904 		if (vrm->vmi_needs_invalidate) {
1905 			vma_iter_invalidate(&vmi);
1906 			vrm->vmi_needs_invalidate = false;
1907 		}
1908 		seen_vma = true;
1909 		target_addr = res_vma + vrm->new_len;
1910 	}
1911 
1912 	return res;
1913 }
1914 
1915 static unsigned long do_mremap(struct vma_remap_struct *vrm)
1916 {
1917 	struct mm_struct *mm = current->mm;
1918 	unsigned long res;
1919 	bool failed;
1920 
1921 	vrm->old_len = PAGE_ALIGN(vrm->old_len);
1922 	vrm->new_len = PAGE_ALIGN(vrm->new_len);
1923 
1924 	res = check_mremap_params(vrm);
1925 	if (res)
1926 		return res;
1927 
1928 	if (mmap_write_lock_killable(mm))
1929 		return -EINTR;
1930 	vrm->mmap_locked = true;
1931 
1932 	if (vrm_move_only(vrm)) {
1933 		res = remap_move(vrm);
1934 	} else {
1935 		vrm->vma = vma_lookup(current->mm, vrm->addr);
1936 		res = check_prep_vma(vrm);
1937 		if (res)
1938 			goto out;
1939 
1940 		/* Actually execute mremap. */
1941 		res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
1942 	}
1943 
1944 out:
1945 	failed = IS_ERR_VALUE(res);
1946 
1947 	if (vrm->mmap_locked)
1948 		mmap_write_unlock(mm);
1949 
1950 	/* VMA mlock()'d + was expanded, so populate the expanded region. */
1951 	if (!failed && vrm->populate_expand)
1952 		mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
1953 
1954 	notify_uffd(vrm, failed);
1955 	return res;
1956 }
1957 
1958 /*
1959  * Expand (or shrink) an existing mapping, potentially moving it at the
1960  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1961  *
1962  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
1963  * This option implies MREMAP_MAYMOVE.
1964  */
1965 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1966 		unsigned long, new_len, unsigned long, flags,
1967 		unsigned long, new_addr)
1968 {
1969 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
1970 	LIST_HEAD(uf_unmap_early);
1971 	LIST_HEAD(uf_unmap);
1972 	/*
1973 	 * There is a deliberate asymmetry here: we strip the pointer tag
1974 	 * from the old address but leave the new address alone. This is
1975 	 * for consistency with mmap(), where we prevent the creation of
1976 	 * aliasing mappings in userspace by leaving the tag bits of the
1977 	 * mapping address intact. A non-zero tag will cause the subsequent
1978 	 * range checks to reject the address as invalid.
1979 	 *
1980 	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
1981 	 * information.
1982 	 */
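	/*
	 * For illustration (arm64 tagged address ABI, 48-bit VAs assumed): an
	 * old address such as 0x0b00ffffaaaab000 is reduced by untagged_addr()
	 * below to 0x0000ffffaaaab000, whereas the same tag left on new_addr
	 * would push it past TASK_SIZE - new_len and fail the range check in
	 * check_mremap_params().
	 */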
1983 	struct vma_remap_struct vrm = {
1984 		.addr = untagged_addr(addr),
1985 		.old_len = old_len,
1986 		.new_len = new_len,
1987 		.flags = flags,
1988 		.new_addr = new_addr,
1989 
1990 		.uf = &uf,
1991 		.uf_unmap_early = &uf_unmap_early,
1992 		.uf_unmap = &uf_unmap,
1993 
1994 		.remap_type = MREMAP_INVALID, /* We set later. */
1995 	};
1996 
1997 	return do_mremap(&vrm);
1998 }
1999