xref: /linux/mm/mremap.c (revision 5cd2340cb6a383d04fd88e48fabc2a21a909d6a1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
31 #include <asm/pgalloc.h>
32 
33 #include "internal.h"
34 
35 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 {
37 	pgd_t *pgd;
38 	p4d_t *p4d;
39 	pud_t *pud;
40 
41 	pgd = pgd_offset(mm, addr);
42 	if (pgd_none_or_clear_bad(pgd))
43 		return NULL;
44 
45 	p4d = p4d_offset(pgd, addr);
46 	if (p4d_none_or_clear_bad(p4d))
47 		return NULL;
48 
49 	pud = pud_offset(p4d, addr);
50 	if (pud_none_or_clear_bad(pud))
51 		return NULL;
52 
53 	return pud;
54 }
55 
56 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 {
58 	pud_t *pud;
59 	pmd_t *pmd;
60 
61 	pud = get_old_pud(mm, addr);
62 	if (!pud)
63 		return NULL;
64 
65 	pmd = pmd_offset(pud, addr);
66 	if (pmd_none(*pmd))
67 		return NULL;
68 
69 	return pmd;
70 }
71 
72 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 			    unsigned long addr)
74 {
75 	pgd_t *pgd;
76 	p4d_t *p4d;
77 
78 	pgd = pgd_offset(mm, addr);
79 	p4d = p4d_alloc(mm, pgd, addr);
80 	if (!p4d)
81 		return NULL;
82 
83 	return pud_alloc(mm, p4d, addr);
84 }
85 
86 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 			    unsigned long addr)
88 {
89 	pud_t *pud;
90 	pmd_t *pmd;
91 
92 	pud = alloc_new_pud(mm, vma, addr);
93 	if (!pud)
94 		return NULL;
95 
96 	pmd = pmd_alloc(mm, pud, addr);
97 	if (!pmd)
98 		return NULL;
99 
100 	VM_BUG_ON(pmd_trans_huge(*pmd));
101 
102 	return pmd;
103 }
104 
105 static void take_rmap_locks(struct vm_area_struct *vma)
106 {
107 	if (vma->vm_file)
108 		i_mmap_lock_write(vma->vm_file->f_mapping);
109 	if (vma->anon_vma)
110 		anon_vma_lock_write(vma->anon_vma);
111 }
112 
113 static void drop_rmap_locks(struct vm_area_struct *vma)
114 {
115 	if (vma->anon_vma)
116 		anon_vma_unlock_write(vma->anon_vma);
117 	if (vma->vm_file)
118 		i_mmap_unlock_write(vma->vm_file->f_mapping);
119 }
120 
121 static pte_t move_soft_dirty_pte(pte_t pte)
122 {
123 	/*
124 	 * Set soft dirty bit so we can notice
125 	 * in userspace the ptes were moved.
126 	 */
127 #ifdef CONFIG_MEM_SOFT_DIRTY
128 	if (pte_present(pte))
129 		pte = pte_mksoft_dirty(pte);
130 	else if (is_swap_pte(pte))
131 		pte = pte_swp_mksoft_dirty(pte);
132 #endif
133 	return pte;
134 }
135 
136 static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 		unsigned long old_addr, unsigned long old_end,
138 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 		unsigned long new_addr, bool need_rmap_locks)
140 {
141 	struct mm_struct *mm = vma->vm_mm;
142 	pte_t *old_pte, *new_pte, pte;
143 	spinlock_t *old_ptl, *new_ptl;
144 	bool force_flush = false;
145 	unsigned long len = old_end - old_addr;
146 	int err = 0;
147 
148 	/*
149 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
150 	 * locks to ensure that rmap will always observe either the old or the
151 	 * new ptes. This is the easiest way to avoid races with
152 	 * truncate_pagecache(), page migration, etc...
153 	 *
154 	 * When need_rmap_locks is false, we use other ways to avoid
155 	 * such races:
156 	 *
157 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
158 	 *   which rmap call sites look for using vma_is_temporary_stack().
159 	 *
160 	 * - During mremap(), new_vma is often known to be placed after vma
161 	 *   in rmap traversal order. This ensures rmap will always observe
162 	 *   either the old pte, or the new pte, or both (the page table locks
163 	 *   serialize access to individual ptes, but only rmap traversal
164 	 *   order guarantees that we won't miss both the old and new ptes).
165 	 */
166 	if (need_rmap_locks)
167 		take_rmap_locks(vma);
168 
169 	/*
170 	 * We don't have to worry about the ordering of src and dst
171 	 * pte locks because exclusive mmap_lock prevents deadlock.
172 	 */
173 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
174 	if (!old_pte) {
175 		err = -EAGAIN;
176 		goto out;
177 	}
178 	new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
179 	if (!new_pte) {
180 		pte_unmap_unlock(old_pte, old_ptl);
181 		err = -EAGAIN;
182 		goto out;
183 	}
184 	if (new_ptl != old_ptl)
185 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
186 	flush_tlb_batched_pending(vma->vm_mm);
187 	arch_enter_lazy_mmu_mode();
188 
189 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
190 				   new_pte++, new_addr += PAGE_SIZE) {
191 		if (pte_none(ptep_get(old_pte)))
192 			continue;
193 
194 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
195 		/*
196 		 * If we are remapping a valid PTE, make sure
197 		 * to flush TLB before we drop the PTL for the
198 		 * PTE.
199 		 *
200 		 * NOTE! Both old and new PTL matter: the old one
201 		 * for racing with folio_mkclean(), the new one to
202 		 * make sure the physical page stays valid until
203 		 * the TLB entry for the old mapping has been
204 		 * flushed.
205 		 */
206 		if (pte_present(pte))
207 			force_flush = true;
208 		pte = move_pte(pte, old_addr, new_addr);
209 		pte = move_soft_dirty_pte(pte);
210 		set_pte_at(mm, new_addr, new_pte, pte);
211 	}
212 
213 	arch_leave_lazy_mmu_mode();
214 	if (force_flush)
215 		flush_tlb_range(vma, old_end - len, old_end);
216 	if (new_ptl != old_ptl)
217 		spin_unlock(new_ptl);
218 	pte_unmap(new_pte - 1);
219 	pte_unmap_unlock(old_pte - 1, old_ptl);
220 out:
221 	if (need_rmap_locks)
222 		drop_rmap_locks(vma);
223 	return err;
224 }
225 
226 #ifndef arch_supports_page_table_move
227 #define arch_supports_page_table_move arch_supports_page_table_move
228 static inline bool arch_supports_page_table_move(void)
229 {
230 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
231 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
232 }
233 #endif
234 
235 #ifdef CONFIG_HAVE_MOVE_PMD
236 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
237 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
238 {
239 	spinlock_t *old_ptl, *new_ptl;
240 	struct mm_struct *mm = vma->vm_mm;
241 	pmd_t pmd;
242 
243 	if (!arch_supports_page_table_move())
244 		return false;
245 	/*
246 	 * The destination pmd shouldn't be established, free_pgtables()
247 	 * should have released it.
248 	 *
249 	 * However, there's a case during execve() where we use mremap
250 	 * to move the initial stack, and in that case the target area
251 	 * may overlap the source area (always moving down).
252 	 *
253 	 * If everything is PMD-aligned, that works fine, as moving
254 	 * each pmd down will clear the source pmd. But if we first
255 	 * have a few 4kB-only pages that get moved down, and then
256 	 * hit the "now the rest is PMD-aligned, let's do everything
257 	 * one pmd at a time", we will still have the old (now empty
258 	 * of any 4kB pages, but still there) PMD in the page table
259 	 * tree.
260 	 *
261 	 * Warn on it once - because we really should try to figure
262 	 * out how to do this better - but then say "I won't move
263 	 * this pmd".
264 	 *
265 	 * One alternative might be to just unmap the target pmd at
266 	 * this point, and verify that it really is empty. We'll see.
267 	 */
268 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
269 		return false;
270 
271 	/*
272 	 * We don't have to worry about the ordering of src and dst
273 	 * ptlocks because exclusive mmap_lock prevents deadlock.
274 	 */
275 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
276 	new_ptl = pmd_lockptr(mm, new_pmd);
277 	if (new_ptl != old_ptl)
278 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
279 
280 	/* Clear the pmd */
281 	pmd = *old_pmd;
282 	pmd_clear(old_pmd);
283 
284 	VM_BUG_ON(!pmd_none(*new_pmd));
285 
286 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
287 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
288 	if (new_ptl != old_ptl)
289 		spin_unlock(new_ptl);
290 	spin_unlock(old_ptl);
291 
292 	return true;
293 }
294 #else
295 static inline bool move_normal_pmd(struct vm_area_struct *vma,
296 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
297 		pmd_t *new_pmd)
298 {
299 	return false;
300 }
301 #endif
302 
303 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
304 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
305 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
306 {
307 	spinlock_t *old_ptl, *new_ptl;
308 	struct mm_struct *mm = vma->vm_mm;
309 	pud_t pud;
310 
311 	if (!arch_supports_page_table_move())
312 		return false;
313 	/*
314 	 * The destination pud shouldn't be established, free_pgtables()
315 	 * should have released it.
316 	 */
317 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
318 		return false;
319 
320 	/*
321 	 * We don't have to worry about the ordering of src and dst
322 	 * ptlocks because exclusive mmap_lock prevents deadlock.
323 	 */
324 	old_ptl = pud_lock(vma->vm_mm, old_pud);
325 	new_ptl = pud_lockptr(mm, new_pud);
326 	if (new_ptl != old_ptl)
327 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
328 
329 	/* Clear the pud */
330 	pud = *old_pud;
331 	pud_clear(old_pud);
332 
333 	VM_BUG_ON(!pud_none(*new_pud));
334 
335 	pud_populate(mm, new_pud, pud_pgtable(pud));
336 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
337 	if (new_ptl != old_ptl)
338 		spin_unlock(new_ptl);
339 	spin_unlock(old_ptl);
340 
341 	return true;
342 }
343 #else
344 static inline bool move_normal_pud(struct vm_area_struct *vma,
345 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
346 		pud_t *new_pud)
347 {
348 	return false;
349 }
350 #endif
351 
352 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
353 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
354 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
355 {
356 	spinlock_t *old_ptl, *new_ptl;
357 	struct mm_struct *mm = vma->vm_mm;
358 	pud_t pud;
359 
360 	/*
361 	 * The destination pud shouldn't be established, free_pgtables()
362 	 * should have released it.
363 	 */
364 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
365 		return false;
366 
367 	/*
368 	 * We don't have to worry about the ordering of src and dst
369 	 * ptlocks because exclusive mmap_lock prevents deadlock.
370 	 */
371 	old_ptl = pud_lock(vma->vm_mm, old_pud);
372 	new_ptl = pud_lockptr(mm, new_pud);
373 	if (new_ptl != old_ptl)
374 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
375 
376 	/* Clear the pud */
377 	pud = *old_pud;
378 	pud_clear(old_pud);
379 
380 	VM_BUG_ON(!pud_none(*new_pud));
381 
382 	/* Set the new pud */
383 	/* mark soft_ditry when we add pud level soft dirty support */
384 	set_pud_at(mm, new_addr, new_pud, pud);
385 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
386 	if (new_ptl != old_ptl)
387 		spin_unlock(new_ptl);
388 	spin_unlock(old_ptl);
389 
390 	return true;
391 }
392 #else
393 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
394 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
395 {
396 	WARN_ON_ONCE(1);
397 	return false;
398 
399 }
400 #endif
401 
/*
 * Kind of page table entry that get_extent() sizes and move_pgt_entry()
 * moves in a single step.
 */
enum pgt_entry {
	NORMAL_PMD,	/* pmd that points to a page table */
	HPAGE_PMD,	/* huge pmd mapping */
	NORMAL_PUD,	/* pud that points to a page table */
	HPAGE_PUD,	/* huge pud mapping */
};
408 
409 /*
410  * Returns an extent of the corresponding size for the pgt_entry specified if
411  * valid. Else returns a smaller extent bounded by the end of the source and
412  * destination pgt_entry.
413  */
414 static __always_inline unsigned long get_extent(enum pgt_entry entry,
415 			unsigned long old_addr, unsigned long old_end,
416 			unsigned long new_addr)
417 {
418 	unsigned long next, extent, mask, size;
419 
420 	switch (entry) {
421 	case HPAGE_PMD:
422 	case NORMAL_PMD:
423 		mask = PMD_MASK;
424 		size = PMD_SIZE;
425 		break;
426 	case HPAGE_PUD:
427 	case NORMAL_PUD:
428 		mask = PUD_MASK;
429 		size = PUD_SIZE;
430 		break;
431 	default:
432 		BUILD_BUG();
433 		break;
434 	}
435 
436 	next = (old_addr + size) & mask;
437 	/* even if next overflowed, extent below will be ok */
438 	extent = next - old_addr;
439 	if (extent > old_end - old_addr)
440 		extent = old_end - old_addr;
441 	next = (new_addr + size) & mask;
442 	if (extent > next - new_addr)
443 		extent = next - new_addr;
444 	return extent;
445 }
446 
447 /*
448  * Attempts to speedup the move by moving entry at the level corresponding to
449  * pgt_entry. Returns true if the move was successful, else false.
450  */
451 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
452 			unsigned long old_addr, unsigned long new_addr,
453 			void *old_entry, void *new_entry, bool need_rmap_locks)
454 {
455 	bool moved = false;
456 
457 	/* See comment in move_ptes() */
458 	if (need_rmap_locks)
459 		take_rmap_locks(vma);
460 
461 	switch (entry) {
462 	case NORMAL_PMD:
463 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
464 					new_entry);
465 		break;
466 	case NORMAL_PUD:
467 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
468 					new_entry);
469 		break;
470 	case HPAGE_PMD:
471 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
472 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
473 				      new_entry);
474 		break;
475 	case HPAGE_PUD:
476 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
477 			move_huge_pud(vma, old_addr, new_addr, old_entry,
478 				      new_entry);
479 		break;
480 
481 	default:
482 		WARN_ON_ONCE(1);
483 		break;
484 	}
485 
486 	if (need_rmap_locks)
487 		drop_rmap_locks(vma);
488 
489 	return moved;
490 }
491 
492 /*
493  * A helper to check if aligning down is OK. The aligned address should fall
494  * on *no mapping*. For the stack moving down, that's a special move within
495  * the VMA that is created to span the source and destination of the move,
496  * so we make an exception for it.
497  */
498 static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
499 			    unsigned long mask, bool for_stack)
500 {
501 	unsigned long addr_masked = addr_to_align & mask;
502 
503 	/*
504 	 * If @addr_to_align of either source or destination is not the beginning
505 	 * of the corresponding VMA, we can't align down or we will destroy part
506 	 * of the current mapping.
507 	 */
508 	if (!for_stack && vma->vm_start != addr_to_align)
509 		return false;
510 
511 	/* In the stack case we explicitly permit in-VMA alignment. */
512 	if (for_stack && addr_masked >= vma->vm_start)
513 		return true;
514 
515 	/*
516 	 * Make sure the realignment doesn't cause the address to fall on an
517 	 * existing mapping.
518 	 */
519 	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
520 }
521 
522 /* Opportunistically realign to specified boundary for faster copy. */
523 static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
524 			     unsigned long *new_addr, struct vm_area_struct *new_vma,
525 			     unsigned long mask, bool for_stack)
526 {
527 	/* Skip if the addresses are already aligned. */
528 	if ((*old_addr & ~mask) == 0)
529 		return;
530 
531 	/* Only realign if the new and old addresses are mutually aligned. */
532 	if ((*old_addr & ~mask) != (*new_addr & ~mask))
533 		return;
534 
535 	/* Ensure realignment doesn't cause overlap with existing mappings. */
536 	if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
537 	    !can_align_down(new_vma, *new_addr, mask, for_stack))
538 		return;
539 
540 	*old_addr = *old_addr & mask;
541 	*new_addr = *new_addr & mask;
542 }
543 
544 unsigned long move_page_tables(struct vm_area_struct *vma,
545 		unsigned long old_addr, struct vm_area_struct *new_vma,
546 		unsigned long new_addr, unsigned long len,
547 		bool need_rmap_locks, bool for_stack)
548 {
549 	unsigned long extent, old_end;
550 	struct mmu_notifier_range range;
551 	pmd_t *old_pmd, *new_pmd;
552 	pud_t *old_pud, *new_pud;
553 
554 	if (!len)
555 		return 0;
556 
557 	old_end = old_addr + len;
558 
559 	if (is_vm_hugetlb_page(vma))
560 		return move_hugetlb_page_tables(vma, new_vma, old_addr,
561 						new_addr, len);
562 
563 	/*
564 	 * If possible, realign addresses to PMD boundary for faster copy.
565 	 * Only realign if the mremap copying hits a PMD boundary.
566 	 */
567 	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
568 		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
569 				 for_stack);
570 
571 	flush_cache_range(vma, old_addr, old_end);
572 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
573 				old_addr, old_end);
574 	mmu_notifier_invalidate_range_start(&range);
575 
576 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
577 		cond_resched();
578 		/*
579 		 * If extent is PUD-sized try to speed up the move by moving at the
580 		 * PUD level if possible.
581 		 */
582 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
583 
584 		old_pud = get_old_pud(vma->vm_mm, old_addr);
585 		if (!old_pud)
586 			continue;
587 		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
588 		if (!new_pud)
589 			break;
590 		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
591 			if (extent == HPAGE_PUD_SIZE) {
592 				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
593 					       old_pud, new_pud, need_rmap_locks);
594 				/* We ignore and continue on error? */
595 				continue;
596 			}
597 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
598 
599 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
600 					   old_pud, new_pud, true))
601 				continue;
602 		}
603 
604 		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
605 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
606 		if (!old_pmd)
607 			continue;
608 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
609 		if (!new_pmd)
610 			break;
611 again:
612 		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
613 		    pmd_devmap(*old_pmd)) {
614 			if (extent == HPAGE_PMD_SIZE &&
615 			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
616 					   old_pmd, new_pmd, need_rmap_locks))
617 				continue;
618 			split_huge_pmd(vma, old_pmd, old_addr);
619 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
620 			   extent == PMD_SIZE) {
621 			/*
622 			 * If the extent is PMD-sized, try to speed the move by
623 			 * moving at the PMD level if possible.
624 			 */
625 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
626 					   old_pmd, new_pmd, true))
627 				continue;
628 		}
629 		if (pmd_none(*old_pmd))
630 			continue;
631 		if (pte_alloc(new_vma->vm_mm, new_pmd))
632 			break;
633 		if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
634 			      new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
635 			goto again;
636 	}
637 
638 	mmu_notifier_invalidate_range_end(&range);
639 
640 	/*
641 	 * Prevent negative return values when {old,new}_addr was realigned
642 	 * but we broke out of the above loop for the first PMD itself.
643 	 */
644 	if (len + old_addr < old_end)
645 		return 0;
646 
647 	return len + old_addr - old_end;	/* how much done */
648 }
649 
650 static unsigned long move_vma(struct vm_area_struct *vma,
651 		unsigned long old_addr, unsigned long old_len,
652 		unsigned long new_len, unsigned long new_addr,
653 		bool *locked, unsigned long flags,
654 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
655 {
656 	long to_account = new_len - old_len;
657 	struct mm_struct *mm = vma->vm_mm;
658 	struct vm_area_struct *new_vma;
659 	unsigned long vm_flags = vma->vm_flags;
660 	unsigned long new_pgoff;
661 	unsigned long moved_len;
662 	unsigned long account_start = 0;
663 	unsigned long account_end = 0;
664 	unsigned long hiwater_vm;
665 	int err = 0;
666 	bool need_rmap_locks;
667 	struct vma_iterator vmi;
668 
669 	/*
670 	 * We'd prefer to avoid failure later on in do_munmap:
671 	 * which may split one vma into three before unmapping.
672 	 */
673 	if (mm->map_count >= sysctl_max_map_count - 3)
674 		return -ENOMEM;
675 
676 	if (unlikely(flags & MREMAP_DONTUNMAP))
677 		to_account = new_len;
678 
679 	if (vma->vm_ops && vma->vm_ops->may_split) {
680 		if (vma->vm_start != old_addr)
681 			err = vma->vm_ops->may_split(vma, old_addr);
682 		if (!err && vma->vm_end != old_addr + old_len)
683 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
684 		if (err)
685 			return err;
686 	}
687 
688 	/*
689 	 * Advise KSM to break any KSM pages in the area to be moved:
690 	 * it would be confusing if they were to turn up at the new
691 	 * location, where they happen to coincide with different KSM
692 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
693 	 * so KSM can come around to merge on vma and new_vma afterwards.
694 	 */
695 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
696 						MADV_UNMERGEABLE, &vm_flags);
697 	if (err)
698 		return err;
699 
700 	if (vm_flags & VM_ACCOUNT) {
701 		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
702 			return -ENOMEM;
703 	}
704 
705 	vma_start_write(vma);
706 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
707 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
708 			   &need_rmap_locks);
709 	if (!new_vma) {
710 		if (vm_flags & VM_ACCOUNT)
711 			vm_unacct_memory(to_account >> PAGE_SHIFT);
712 		return -ENOMEM;
713 	}
714 
715 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
716 				     need_rmap_locks, false);
717 	if (moved_len < old_len) {
718 		err = -ENOMEM;
719 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
720 		err = vma->vm_ops->mremap(new_vma);
721 	}
722 
723 	if (unlikely(err)) {
724 		/*
725 		 * On error, move entries back from new area to old,
726 		 * which will succeed since page tables still there,
727 		 * and then proceed to unmap new area instead of old.
728 		 */
729 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
730 				 true, false);
731 		vma = new_vma;
732 		old_len = new_len;
733 		old_addr = new_addr;
734 		new_addr = err;
735 	} else {
736 		mremap_userfaultfd_prep(new_vma, uf);
737 	}
738 
739 	if (is_vm_hugetlb_page(vma)) {
740 		clear_vma_resv_huge_pages(vma);
741 	}
742 
743 	/* Conceal VM_ACCOUNT so old reservation is not undone */
744 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
745 		vm_flags_clear(vma, VM_ACCOUNT);
746 		if (vma->vm_start < old_addr)
747 			account_start = vma->vm_start;
748 		if (vma->vm_end > old_addr + old_len)
749 			account_end = vma->vm_end;
750 	}
751 
752 	/*
753 	 * If we failed to move page tables we still do total_vm increment
754 	 * since do_munmap() will decrement it by old_len == new_len.
755 	 *
756 	 * Since total_vm is about to be raised artificially high for a
757 	 * moment, we need to restore high watermark afterwards: if stats
758 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
759 	 * If this were a serious issue, we'd add a flag to do_munmap().
760 	 */
761 	hiwater_vm = mm->hiwater_vm;
762 	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
763 
764 	/* Tell pfnmap has moved from this vma */
765 	if (unlikely(vma->vm_flags & VM_PFNMAP))
766 		untrack_pfn_clear(vma);
767 
768 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
769 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
770 		vm_flags_clear(vma, VM_LOCKED_MASK);
771 
772 		/*
773 		 * anon_vma links of the old vma is no longer needed after its page
774 		 * table has been moved.
775 		 */
776 		if (new_vma != vma && vma->vm_start == old_addr &&
777 			vma->vm_end == (old_addr + old_len))
778 			unlink_anon_vmas(vma);
779 
780 		/* Because we won't unmap we don't need to touch locked_vm */
781 		return new_addr;
782 	}
783 
784 	vma_iter_init(&vmi, mm, old_addr);
785 	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
786 		/* OOM: unable to split vma, just get accounts right */
787 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
788 			vm_acct_memory(old_len >> PAGE_SHIFT);
789 		account_start = account_end = 0;
790 	}
791 
792 	if (vm_flags & VM_LOCKED) {
793 		mm->locked_vm += new_len >> PAGE_SHIFT;
794 		*locked = true;
795 	}
796 
797 	mm->hiwater_vm = hiwater_vm;
798 
799 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
800 	if (account_start) {
801 		vma = vma_prev(&vmi);
802 		vm_flags_set(vma, VM_ACCOUNT);
803 	}
804 
805 	if (account_end) {
806 		vma = vma_next(&vmi);
807 		vm_flags_set(vma, VM_ACCOUNT);
808 	}
809 
810 	return new_addr;
811 }
812 
813 static struct vm_area_struct *vma_to_resize(unsigned long addr,
814 	unsigned long old_len, unsigned long new_len, unsigned long flags)
815 {
816 	struct mm_struct *mm = current->mm;
817 	struct vm_area_struct *vma;
818 	unsigned long pgoff;
819 
820 	vma = vma_lookup(mm, addr);
821 	if (!vma)
822 		return ERR_PTR(-EFAULT);
823 
824 	/*
825 	 * !old_len is a special case where an attempt is made to 'duplicate'
826 	 * a mapping.  This makes no sense for private mappings as it will
827 	 * instead create a fresh/new mapping unrelated to the original.  This
828 	 * is contrary to the basic idea of mremap which creates new mappings
829 	 * based on the original.  There are no known use cases for this
830 	 * behavior.  As a result, fail such attempts.
831 	 */
832 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
833 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
834 		return ERR_PTR(-EINVAL);
835 	}
836 
837 	if ((flags & MREMAP_DONTUNMAP) &&
838 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
839 		return ERR_PTR(-EINVAL);
840 
841 	/* We can't remap across vm area boundaries */
842 	if (old_len > vma->vm_end - addr)
843 		return ERR_PTR(-EFAULT);
844 
845 	if (new_len == old_len)
846 		return vma;
847 
848 	/* Need to be careful about a growing mapping */
849 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
850 	pgoff += vma->vm_pgoff;
851 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
852 		return ERR_PTR(-EINVAL);
853 
854 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
855 		return ERR_PTR(-EFAULT);
856 
857 	if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
858 		return ERR_PTR(-EAGAIN);
859 
860 	if (!may_expand_vm(mm, vma->vm_flags,
861 				(new_len - old_len) >> PAGE_SHIFT))
862 		return ERR_PTR(-ENOMEM);
863 
864 	return vma;
865 }
866 
867 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
868 		unsigned long new_addr, unsigned long new_len, bool *locked,
869 		unsigned long flags, struct vm_userfaultfd_ctx *uf,
870 		struct list_head *uf_unmap_early,
871 		struct list_head *uf_unmap)
872 {
873 	struct mm_struct *mm = current->mm;
874 	struct vm_area_struct *vma;
875 	unsigned long ret = -EINVAL;
876 	unsigned long map_flags = 0;
877 
878 	if (offset_in_page(new_addr))
879 		goto out;
880 
881 	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
882 		goto out;
883 
884 	/* Ensure the old/new locations do not overlap */
885 	if (addr + old_len > new_addr && new_addr + new_len > addr)
886 		goto out;
887 
888 	/*
889 	 * move_vma() need us to stay 4 maps below the threshold, otherwise
890 	 * it will bail out at the very beginning.
891 	 * That is a problem if we have already unmaped the regions here
892 	 * (new_addr, and old_addr), because userspace will not know the
893 	 * state of the vma's after it gets -ENOMEM.
894 	 * So, to avoid such scenario we can pre-compute if the whole
895 	 * operation has high chances to success map-wise.
896 	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
897 	 * split in 3 before unmapping it.
898 	 * That means 2 more maps (1 for each) to the ones we already hold.
899 	 * Check whether current map count plus 2 still leads us to 4 maps below
900 	 * the threshold, otherwise return -ENOMEM here to be more safe.
901 	 */
902 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
903 		return -ENOMEM;
904 
905 	/*
906 	 * In mremap_to().
907 	 * Move a VMA to another location, check if src addr is sealed.
908 	 *
909 	 * Place can_modify_mm here because mremap_to()
910 	 * does its own checking for address range, and we only
911 	 * check the sealing after passing those checks.
912 	 *
913 	 * can_modify_mm assumes we have acquired the lock on MM.
914 	 */
915 	if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
916 		return -EPERM;
917 
918 	if (flags & MREMAP_FIXED) {
919 		/*
920 		 * In mremap_to().
921 		 * VMA is moved to dst address, and munmap dst first.
922 		 * do_munmap will check if dst is sealed.
923 		 */
924 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
925 		if (ret)
926 			goto out;
927 	}
928 
929 	if (old_len > new_len) {
930 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
931 		if (ret)
932 			goto out;
933 		old_len = new_len;
934 	}
935 
936 	vma = vma_to_resize(addr, old_len, new_len, flags);
937 	if (IS_ERR(vma)) {
938 		ret = PTR_ERR(vma);
939 		goto out;
940 	}
941 
942 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
943 	if (flags & MREMAP_DONTUNMAP &&
944 		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
945 		ret = -ENOMEM;
946 		goto out;
947 	}
948 
949 	if (flags & MREMAP_FIXED)
950 		map_flags |= MAP_FIXED;
951 
952 	if (vma->vm_flags & VM_MAYSHARE)
953 		map_flags |= MAP_SHARED;
954 
955 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
956 				((addr - vma->vm_start) >> PAGE_SHIFT),
957 				map_flags);
958 	if (IS_ERR_VALUE(ret))
959 		goto out;
960 
961 	/* We got a new mapping */
962 	if (!(flags & MREMAP_FIXED))
963 		new_addr = ret;
964 
965 	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
966 		       uf_unmap);
967 
968 out:
969 	return ret;
970 }
971 
972 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
973 {
974 	unsigned long end = vma->vm_end + delta;
975 
976 	if (end < vma->vm_end) /* overflow */
977 		return 0;
978 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
979 		return 0;
980 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
981 			      0, MAP_FIXED) & ~PAGE_MASK)
982 		return 0;
983 	return 1;
984 }
985 
986 /*
987  * Expand (or shrink) an existing mapping, potentially moving it at the
988  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
989  *
990  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
991  * This option implies MREMAP_MAYMOVE.
 * (In the code below this is enforced rather than inferred: MREMAP_FIXED
 * without MREMAP_MAYMOVE is rejected with -EINVAL.)
 *
 * @addr:     start of the range to remap. Its pointer tag is stripped and
 *            it must then be page aligned.
 * @old_len:  current length of the range; rounded up to a whole number of
 *            pages. Zero is accepted.
 * @new_len:  requested length; rounded up to a whole number of pages and
 *            must be non-zero.
 * @flags:    combination of MREMAP_MAYMOVE, MREMAP_FIXED and
 *            MREMAP_DONTUNMAP; any other bit is rejected.
 * @new_addr: destination address handed to mremap_to() for MREMAP_FIXED /
 *            MREMAP_DONTUNMAP requests. Its tag bits are not stripped.
 *
 * Returns @addr when the range was resized in place, otherwise the value
 * produced by mremap_to() / move_vma(). Failures detected in this function
 * are reported as -EINVAL, -EINTR, -EFAULT, -EPERM or -ENOMEM; a return
 * value that is not page aligned is treated as an error at the out: label.
992  */
993 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
994 		unsigned long, new_len, unsigned long, flags,
995 		unsigned long, new_addr)
996 {
997 	struct mm_struct *mm = current->mm;
998 	struct vm_area_struct *vma;
	/* Default result: the validation failures below all report -EINVAL. */
999 	unsigned long ret = -EINVAL;
	/* Set when the newly added range must be populated via mm_populate(). */
1000 	bool locked = false;
1001 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	/* Lists handed to the helpers and completed at out_unlocked. */
1002 	LIST_HEAD(uf_unmap_early);
1003 	LIST_HEAD(uf_unmap);
1004 
1005 	/*
1006 	 * There is a deliberate asymmetry here: we strip the pointer tag
1007 	 * from the old address but leave the new address alone. This is
1008 	 * for consistency with mmap(), where we prevent the creation of
1009 	 * aliasing mappings in userspace by leaving the tag bits of the
1010 	 * mapping address intact. A non-zero tag will cause the subsequent
1011 	 * range checks to reject the address as invalid.
1012 	 *
1013 	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
1014 	 * information.
1015 	 */
1016 	addr = untagged_addr(addr);
1017 
	/* Reject any flag bit this syscall does not know about. */
1018 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
1019 		return ret;
1020 
	/* MREMAP_FIXED is only valid together with MREMAP_MAYMOVE. */
1021 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
1022 		return ret;
1023 
1024 	/*
1025 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
1026 	 * in the process.
1027 	 */
1028 	if (flags & MREMAP_DONTUNMAP &&
1029 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
1030 		return ret;
1031 
1032 
	/* The old address must be page aligned. */
1033 	if (offset_in_page(addr))
1034 		return ret;
1035 
	/* From here on both lengths are whole multiples of the page size. */
1036 	old_len = PAGE_ALIGN(old_len);
1037 	new_len = PAGE_ALIGN(new_len);
1038 
1039 	/*
1040 	 * We allow a zero old-len as a special case
1041 	 * for DOS-emu "duplicate shm area" thing. But
1042 	 * a zero new-len is nonsensical.
1043 	 */
1044 	if (!new_len)
1045 		return ret;
1046 
	/*
	 * Take the mmap lock for writing (current->mm is the same mm that
	 * was cached in 'mm' above). A non-zero return from the killable
	 * lock attempt is reported to the caller as -EINTR.
	 */
1047 	if (mmap_write_lock_killable(current->mm))
1048 		return -EINTR;
	/* No vma found for addr: report -EFAULT. */
1049 	vma = vma_lookup(mm, addr);
1050 	if (!vma) {
1051 		ret = -EFAULT;
1052 		goto out;
1053 	}
1054 
	/*
	 * hugetlb mappings: lengths are rounded up to the huge page size.
	 * 'ret' still holds -EINVAL here, so every 'goto out' in this block
	 * fails the call with -EINVAL.
	 */
1055 	if (is_vm_hugetlb_page(vma)) {
1056 		struct hstate *h __maybe_unused = hstate_vma(vma);
1057 
1058 		old_len = ALIGN(old_len, huge_page_size(h));
1059 		new_len = ALIGN(new_len, huge_page_size(h));
1060 
1061 		/* addrs must be huge page aligned */
1062 		if (addr & ~huge_page_mask(h))
1063 			goto out;
		/*
		 * NOTE(review): new_addr is checked here before the flags are
		 * consulted, so the check also applies to requests without
		 * MREMAP_FIXED/MREMAP_DONTUNMAP - confirm this is intended.
		 */
1064 		if (new_addr & ~huge_page_mask(h))
1065 			goto out;
1066 
1067 		/*
1068 		 * Don't allow remap expansion, because the underlying hugetlb
1069 		 * reservation is not yet capable to handle split reservation.
1070 		 */
1071 		if (new_len > old_len)
1072 			goto out;
1073 	}
1074 
	/*
	 * Requests with MREMAP_FIXED or MREMAP_DONTUNMAP are handled entirely
	 * by mremap_to(); its result is returned through the common exit path.
	 */
1075 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
1076 		ret = mremap_to(addr, old_len, new_addr, new_len,
1077 				&locked, flags, &uf, &uf_unmap_early,
1078 				&uf_unmap);
1079 		goto out;
1080 	}
1081 
1082 	/*
1083 	 * Below is shrink/expand case (not mremap_to())
1084 	 * Check if src address is sealed, if so, reject.
1085 	 * In other words, prevent shrinking or expanding a sealed VMA.
1086 	 *
1087 	 * Place can_modify_mm here so we can keep the logic related to
1088 	 * shrink/expand together.
1089 	 */
1090 	if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
1091 		ret = -EPERM;
1092 		goto out;
1093 	}
1094 
1095 	/*
1096 	 * Always allow a shrinking remap: that just unmaps
1097 	 * the unnecessary pages..
1098 	 * do_vmi_munmap does all the needed commit accounting, and
1099 	 * unlocks the mmap_lock if so directed.
1100 	 */
1101 	if (old_len >= new_len) {
1102 		VMA_ITERATOR(vmi, mm, addr + new_len);
1103 
		/* Same size and no move requested: nothing to do. */
1104 		if (old_len == new_len) {
1105 			ret = addr;
1106 			goto out;
1107 		}
1108 
		/*
		 * Unmap the tail [addr + new_len, addr + old_len). The final
		 * 'true' argument asks do_vmi_munmap() to drop the mmap lock.
		 * The failure path goes through out:, which unlocks, so the
		 * lock is presumably still held when an error is returned -
		 * verify against do_vmi_munmap().
		 */
1109 		ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
1110 				    &uf_unmap, true);
1111 		if (ret)
1112 			goto out;
1113 
		/* Success: skip the unlock at out:, the lock is already gone. */
1114 		ret = addr;
1115 		goto out_unlocked;
1116 	}
1117 
1118 	/*
1119 	 * Ok, we need to grow..
1120 	 */
	/* vma_to_resize() yields either the vma to grow or an ERR_PTR. */
1121 	vma = vma_to_resize(addr, old_len, new_len, flags);
1122 	if (IS_ERR(vma)) {
1123 		ret = PTR_ERR(vma);
1124 		goto out;
1125 	}
1126 
1127 	/* old_len exactly to the end of the area..
1128 	 */
1129 	if (old_len == vma->vm_end - addr) {
1130 		unsigned long delta = new_len - old_len;
1131 
1132 		/* can we just expand the current mapping? */
1133 		if (vma_expandable(vma, delta)) {
1134 			long pages = delta >> PAGE_SHIFT;
1135 			VMA_ITERATOR(vmi, mm, vma->vm_end);
			/* Pages charged so far; undone if the merge fails. */
1136 			long charged = 0;
1137 
			/* Accounted mappings must pass the memory check first. */
1138 			if (vma->vm_flags & VM_ACCOUNT) {
1139 				if (security_vm_enough_memory_mm(mm, pages)) {
1140 					ret = -ENOMEM;
1141 					goto out;
1142 				}
1143 				charged = pages;
1144 			}
1145 
1146 			/*
1147 			 * Function vma_merge_extend() is called on the
1148 			 * extension we are adding to the already existing vma,
1149 			 * vma_merge_extend() will merge this extension with the
1150 			 * already existing vma (expand operation itself) and
1151 			 * possibly also with the next vma if it becomes
1152 			 * adjacent to the expanded vma and otherwise
1153 			 * compatible.
1154 			 */
1155 			vma = vma_merge_extend(&vmi, vma, delta);
1156 			if (!vma) {
				/* Roll back the charge taken above (0 if none). */
1157 				vm_unacct_memory(charged);
1158 				ret = -ENOMEM;
1159 				goto out;
1160 			}
1161 
1162 			vm_stat_account(mm, vma->vm_flags, pages);
			/*
			 * For a VM_LOCKED vma, account the extra pages and
			 * point new_addr at addr so that the mm_populate()
			 * call at out: covers [addr + old_len, addr + new_len).
			 */
1163 			if (vma->vm_flags & VM_LOCKED) {
1164 				mm->locked_vm += pages;
1165 				locked = true;
1166 				new_addr = addr;
1167 			}
1168 			ret = addr;
1169 			goto out;
1170 		}
1171 	}
1172 
1173 	/*
1174 	 * We weren't able to just expand or shrink the area,
1175 	 * we need to create a new one and move it..
1176 	 */
	/* Without MREMAP_MAYMOVE the call fails with this -ENOMEM. */
1177 	ret = -ENOMEM;
1178 	if (flags & MREMAP_MAYMOVE) {
1179 		unsigned long map_flags = 0;
1180 		if (vma->vm_flags & VM_MAYSHARE)
1181 			map_flags |= MAP_SHARED;
1182 
		/*
		 * Look for a free range of new_len bytes, passing the file
		 * page offset that corresponds to addr within the vma.
		 */
1183 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1184 					vma->vm_pgoff +
1185 					((addr - vma->vm_start) >> PAGE_SHIFT),
1186 					map_flags);
1187 		if (IS_ERR_VALUE(new_addr)) {
1188 			ret = new_addr;
1189 			goto out;
1190 		}
1191 
1192 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1193 			       &locked, flags, &uf, &uf_unmap);
1194 	}
1195 out:
	/* A result that is not page aligned is an error: do not populate. */
1196 	if (offset_in_page(ret))
1197 		locked = false;
1198 	mmap_write_unlock(current->mm);
	/* Populate only the part that was added, after dropping the lock. */
1199 	if (locked && new_len > old_len)
1200 		mm_populate(new_addr + old_len, new_len - old_len);
1201 out_unlocked:
	/* Reached with the mmap lock no longer held on every path. */
1202 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1203 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1204 	userfaultfd_unmap_complete(mm, &uf_unmap);
1205 	return ret;
1206 }
1207