xref: /linux/mm/mremap.c (revision 0526b56cbc3c489642bd6a5fe4b718dea7ef0ee8)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
31 #include <asm/pgalloc.h>
32 
33 #include "internal.h"
34 
35 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 {
37 	pgd_t *pgd;
38 	p4d_t *p4d;
39 	pud_t *pud;
40 
41 	pgd = pgd_offset(mm, addr);
42 	if (pgd_none_or_clear_bad(pgd))
43 		return NULL;
44 
45 	p4d = p4d_offset(pgd, addr);
46 	if (p4d_none_or_clear_bad(p4d))
47 		return NULL;
48 
49 	pud = pud_offset(p4d, addr);
50 	if (pud_none_or_clear_bad(pud))
51 		return NULL;
52 
53 	return pud;
54 }
55 
56 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 {
58 	pud_t *pud;
59 	pmd_t *pmd;
60 
61 	pud = get_old_pud(mm, addr);
62 	if (!pud)
63 		return NULL;
64 
65 	pmd = pmd_offset(pud, addr);
66 	if (pmd_none(*pmd))
67 		return NULL;
68 
69 	return pmd;
70 }
71 
72 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 			    unsigned long addr)
74 {
75 	pgd_t *pgd;
76 	p4d_t *p4d;
77 
78 	pgd = pgd_offset(mm, addr);
79 	p4d = p4d_alloc(mm, pgd, addr);
80 	if (!p4d)
81 		return NULL;
82 
83 	return pud_alloc(mm, p4d, addr);
84 }
85 
86 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 			    unsigned long addr)
88 {
89 	pud_t *pud;
90 	pmd_t *pmd;
91 
92 	pud = alloc_new_pud(mm, vma, addr);
93 	if (!pud)
94 		return NULL;
95 
96 	pmd = pmd_alloc(mm, pud, addr);
97 	if (!pmd)
98 		return NULL;
99 
100 	VM_BUG_ON(pmd_trans_huge(*pmd));
101 
102 	return pmd;
103 }
104 
105 static void take_rmap_locks(struct vm_area_struct *vma)
106 {
107 	if (vma->vm_file)
108 		i_mmap_lock_write(vma->vm_file->f_mapping);
109 	if (vma->anon_vma)
110 		anon_vma_lock_write(vma->anon_vma);
111 }
112 
113 static void drop_rmap_locks(struct vm_area_struct *vma)
114 {
115 	if (vma->anon_vma)
116 		anon_vma_unlock_write(vma->anon_vma);
117 	if (vma->vm_file)
118 		i_mmap_unlock_write(vma->vm_file->f_mapping);
119 }
120 
121 static pte_t move_soft_dirty_pte(pte_t pte)
122 {
123 	/*
124 	 * Set soft dirty bit so we can notice
125 	 * in userspace the ptes were moved.
126 	 */
127 #ifdef CONFIG_MEM_SOFT_DIRTY
128 	if (pte_present(pte))
129 		pte = pte_mksoft_dirty(pte);
130 	else if (is_swap_pte(pte))
131 		pte = pte_swp_mksoft_dirty(pte);
132 #endif
133 	return pte;
134 }
135 
/*
 * move_ptes - move the PTEs mapping [old_addr, old_end) from the PTE table
 * under @old_pmd into the PTE table under @new_pmd, starting at @new_addr.
 *
 * Every non-empty source entry is cleared, passed through move_pte() and
 * move_soft_dirty_pte(), and installed in the matching destination slot.
 * If at least one moved entry was present, the old range is TLB-flushed
 * before either page table lock is released.
 *
 * @need_rmap_locks: when true, the rmap locks of @vma are held for the
 * duration of the move (see the first comment in the function body).
 */
136 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 		unsigned long old_addr, unsigned long old_end,
138 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 		unsigned long new_addr, bool need_rmap_locks)
140 {
141 	struct mm_struct *mm = vma->vm_mm;
142 	pte_t *old_pte, *new_pte, pte;
143 	spinlock_t *old_ptl, *new_ptl;
144 	bool force_flush = false;
	/* old_addr is advanced by the loop; len lets us recover the start. */
145 	unsigned long len = old_end - old_addr;
146 
147 	/*
148 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
149 	 * locks to ensure that rmap will always observe either the old or the
150 	 * new ptes. This is the easiest way to avoid races with
151 	 * truncate_pagecache(), page migration, etc...
152 	 *
153 	 * When need_rmap_locks is false, we use other ways to avoid
154 	 * such races:
155 	 *
156 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
157 	 *   which rmap call sites look for using vma_is_temporary_stack().
158 	 *
159 	 * - During mremap(), new_vma is often known to be placed after vma
160 	 *   in rmap traversal order. This ensures rmap will always observe
161 	 *   either the old pte, or the new pte, or both (the page table locks
162 	 *   serialize access to individual ptes, but only rmap traversal
163 	 *   order guarantees that we won't miss both the old and new ptes).
164 	 */
165 	if (need_rmap_locks)
166 		take_rmap_locks(vma);
167 
168 	/*
169 	 * We don't have to worry about the ordering of src and dst
170 	 * pte locks because exclusive mmap_lock prevents deadlock.
171 	 */
172 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
173 	new_pte = pte_offset_map(new_pmd, new_addr);
174 	new_ptl = pte_lockptr(mm, new_pmd);
	/* Source and destination may share one lock; only take it once. */
175 	if (new_ptl != old_ptl)
176 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
177 	flush_tlb_batched_pending(vma->vm_mm);
178 	arch_enter_lazy_mmu_mode();
179 
180 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
181 				   new_pte++, new_addr += PAGE_SIZE) {
		/* Empty source slot: nothing to move for this page. */
182 		if (pte_none(*old_pte))
183 			continue;
184 
185 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
186 		/*
187 		 * If we are remapping a valid PTE, make sure
188 		 * to flush TLB before we drop the PTL for the
189 		 * PTE.
190 		 *
191 		 * NOTE! Both old and new PTL matter: the old one
192 		 * for racing with page_mkclean(), the new one to
193 		 * make sure the physical page stays valid until
194 		 * the TLB entry for the old mapping has been
195 		 * flushed.
196 		 */
197 		if (pte_present(pte))
198 			force_flush = true;
199 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
200 		pte = move_soft_dirty_pte(pte);
201 		set_pte_at(mm, new_addr, new_pte, pte);
202 	}
203 
204 	arch_leave_lazy_mmu_mode();
	/* old_addr now equals old_end, so the start is old_end - len. */
205 	if (force_flush)
206 		flush_tlb_range(vma, old_end - len, old_end);
207 	if (new_ptl != old_ptl)
208 		spin_unlock(new_ptl);
	/*
	 * The loop left both pte pointers one entry past the last slot it
	 * visited; step back by one before unmapping.
	 */
209 	pte_unmap(new_pte - 1);
210 	pte_unmap_unlock(old_pte - 1, old_ptl);
211 	if (need_rmap_locks)
212 		drop_rmap_locks(vma);
213 }
214 
215 #ifndef arch_supports_page_table_move
216 #define arch_supports_page_table_move arch_supports_page_table_move
217 static inline bool arch_supports_page_table_move(void)
218 {
219 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
220 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
221 }
222 #endif
223 
224 #ifdef CONFIG_HAVE_MOVE_PMD
225 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
226 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
227 {
228 	spinlock_t *old_ptl, *new_ptl;
229 	struct mm_struct *mm = vma->vm_mm;
230 	pmd_t pmd;
231 
232 	if (!arch_supports_page_table_move())
233 		return false;
234 	/*
235 	 * The destination pmd shouldn't be established, free_pgtables()
236 	 * should have released it.
237 	 *
238 	 * However, there's a case during execve() where we use mremap
239 	 * to move the initial stack, and in that case the target area
240 	 * may overlap the source area (always moving down).
241 	 *
242 	 * If everything is PMD-aligned, that works fine, as moving
243 	 * each pmd down will clear the source pmd. But if we first
244 	 * have a few 4kB-only pages that get moved down, and then
245 	 * hit the "now the rest is PMD-aligned, let's do everything
246 	 * one pmd at a time", we will still have the old (now empty
247 	 * of any 4kB pages, but still there) PMD in the page table
248 	 * tree.
249 	 *
250 	 * Warn on it once - because we really should try to figure
251 	 * out how to do this better - but then say "I won't move
252 	 * this pmd".
253 	 *
254 	 * One alternative might be to just unmap the target pmd at
255 	 * this point, and verify that it really is empty. We'll see.
256 	 */
257 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
258 		return false;
259 
260 	/*
261 	 * We don't have to worry about the ordering of src and dst
262 	 * ptlocks because exclusive mmap_lock prevents deadlock.
263 	 */
264 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
265 	new_ptl = pmd_lockptr(mm, new_pmd);
266 	if (new_ptl != old_ptl)
267 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
268 
269 	/* Clear the pmd */
270 	pmd = *old_pmd;
271 	pmd_clear(old_pmd);
272 
273 	VM_BUG_ON(!pmd_none(*new_pmd));
274 
275 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
276 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
277 	if (new_ptl != old_ptl)
278 		spin_unlock(new_ptl);
279 	spin_unlock(old_ptl);
280 
281 	return true;
282 }
283 #else
284 static inline bool move_normal_pmd(struct vm_area_struct *vma,
285 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
286 		pmd_t *new_pmd)
287 {
288 	return false;
289 }
290 #endif
291 
292 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
293 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
294 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
295 {
296 	spinlock_t *old_ptl, *new_ptl;
297 	struct mm_struct *mm = vma->vm_mm;
298 	pud_t pud;
299 
300 	if (!arch_supports_page_table_move())
301 		return false;
302 	/*
303 	 * The destination pud shouldn't be established, free_pgtables()
304 	 * should have released it.
305 	 */
306 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
307 		return false;
308 
309 	/*
310 	 * We don't have to worry about the ordering of src and dst
311 	 * ptlocks because exclusive mmap_lock prevents deadlock.
312 	 */
313 	old_ptl = pud_lock(vma->vm_mm, old_pud);
314 	new_ptl = pud_lockptr(mm, new_pud);
315 	if (new_ptl != old_ptl)
316 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
317 
318 	/* Clear the pud */
319 	pud = *old_pud;
320 	pud_clear(old_pud);
321 
322 	VM_BUG_ON(!pud_none(*new_pud));
323 
324 	pud_populate(mm, new_pud, pud_pgtable(pud));
325 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
326 	if (new_ptl != old_ptl)
327 		spin_unlock(new_ptl);
328 	spin_unlock(old_ptl);
329 
330 	return true;
331 }
332 #else
333 static inline bool move_normal_pud(struct vm_area_struct *vma,
334 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
335 		pud_t *new_pud)
336 {
337 	return false;
338 }
339 #endif
340 
341 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
342 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 {
345 	spinlock_t *old_ptl, *new_ptl;
346 	struct mm_struct *mm = vma->vm_mm;
347 	pud_t pud;
348 
349 	/*
350 	 * The destination pud shouldn't be established, free_pgtables()
351 	 * should have released it.
352 	 */
353 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
354 		return false;
355 
356 	/*
357 	 * We don't have to worry about the ordering of src and dst
358 	 * ptlocks because exclusive mmap_lock prevents deadlock.
359 	 */
360 	old_ptl = pud_lock(vma->vm_mm, old_pud);
361 	new_ptl = pud_lockptr(mm, new_pud);
362 	if (new_ptl != old_ptl)
363 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
364 
365 	/* Clear the pud */
366 	pud = *old_pud;
367 	pud_clear(old_pud);
368 
369 	VM_BUG_ON(!pud_none(*new_pud));
370 
371 	/* Set the new pud */
372 	/* mark soft_ditry when we add pud level soft dirty support */
373 	set_pud_at(mm, new_addr, new_pud, pud);
374 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
375 	if (new_ptl != old_ptl)
376 		spin_unlock(new_ptl);
377 	spin_unlock(old_ptl);
378 
379 	return true;
380 }
381 #else
382 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
383 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
384 {
385 	WARN_ON_ONCE(1);
386 	return false;
387 
388 }
389 #endif
390 
/*
 * Kind of page table entry handed to move_pgt_entry(), which uses it to pick
 * the mover; get_extent() uses it to pick PMD or PUD geometry.
 */
391 enum pgt_entry {
392 	NORMAL_PMD,	/* dispatched to move_normal_pmd() */
393 	HPAGE_PMD,	/* dispatched to move_huge_pmd() */
394 	NORMAL_PUD,	/* dispatched to move_normal_pud() */
395 	HPAGE_PUD,	/* dispatched to move_huge_pud() */
396 };
397 
398 /*
399  * Returns an extent of the corresponding size for the pgt_entry specified if
400  * valid. Else returns a smaller extent bounded by the end of the source and
401  * destination pgt_entry.
402  */
403 static __always_inline unsigned long get_extent(enum pgt_entry entry,
404 			unsigned long old_addr, unsigned long old_end,
405 			unsigned long new_addr)
406 {
407 	unsigned long next, extent, mask, size;
408 
409 	switch (entry) {
410 	case HPAGE_PMD:
411 	case NORMAL_PMD:
412 		mask = PMD_MASK;
413 		size = PMD_SIZE;
414 		break;
415 	case HPAGE_PUD:
416 	case NORMAL_PUD:
417 		mask = PUD_MASK;
418 		size = PUD_SIZE;
419 		break;
420 	default:
421 		BUILD_BUG();
422 		break;
423 	}
424 
425 	next = (old_addr + size) & mask;
426 	/* even if next overflowed, extent below will be ok */
427 	extent = next - old_addr;
428 	if (extent > old_end - old_addr)
429 		extent = old_end - old_addr;
430 	next = (new_addr + size) & mask;
431 	if (extent > next - new_addr)
432 		extent = next - new_addr;
433 	return extent;
434 }
435 
436 /*
437  * Attempts to speedup the move by moving entry at the level corresponding to
438  * pgt_entry. Returns true if the move was successful, else false.
439  */
440 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
441 			unsigned long old_addr, unsigned long new_addr,
442 			void *old_entry, void *new_entry, bool need_rmap_locks)
443 {
444 	bool moved = false;
445 
446 	/* See comment in move_ptes() */
447 	if (need_rmap_locks)
448 		take_rmap_locks(vma);
449 
450 	switch (entry) {
451 	case NORMAL_PMD:
452 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
453 					new_entry);
454 		break;
455 	case NORMAL_PUD:
456 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
457 					new_entry);
458 		break;
459 	case HPAGE_PMD:
460 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
461 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
462 				      new_entry);
463 		break;
464 	case HPAGE_PUD:
465 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
466 			move_huge_pud(vma, old_addr, new_addr, old_entry,
467 				      new_entry);
468 		break;
469 
470 	default:
471 		WARN_ON_ONCE(1);
472 		break;
473 	}
474 
475 	if (need_rmap_locks)
476 		drop_rmap_locks(vma);
477 
478 	return moved;
479 }
480 
/*
 * move_page_tables - move the page table contents for @len bytes starting at
 * @old_addr in @vma to @new_addr in @new_vma.
 *
 * The range is processed in extents computed by get_extent().  For each
 * extent the function first tries to move a whole PUD, then a whole PMD
 * (huge or normal), and finally falls back to moving individual ptes with
 * move_ptes().  hugetlb vmas are handed to move_hugetlb_page_tables().
 *
 * The walk is bracketed by flush_cache_range() on the old range and an
 * MMU_NOTIFY_UNMAP mmu notifier invalidation over it.
 *
 * @need_rmap_locks is forwarded to the huge-entry moves and to move_ptes();
 * the NORMAL_PUD and NORMAL_PMD moves always request the rmap locks.
 *
 * Returns the number of bytes handled: @len when the loop ran to completion,
 * less when a page table allocation failed part-way, and 0 when @len is 0.
 */
481 unsigned long move_page_tables(struct vm_area_struct *vma,
482 		unsigned long old_addr, struct vm_area_struct *new_vma,
483 		unsigned long new_addr, unsigned long len,
484 		bool need_rmap_locks)
485 {
486 	unsigned long extent, old_end;
487 	struct mmu_notifier_range range;
488 	pmd_t *old_pmd, *new_pmd;
489 	pud_t *old_pud, *new_pud;
490 
491 	if (!len)
492 		return 0;
493 
494 	old_end = old_addr + len;
495 
496 	if (is_vm_hugetlb_page(vma))
497 		return move_hugetlb_page_tables(vma, new_vma, old_addr,
498 						new_addr, len);
499 
500 	flush_cache_range(vma, old_addr, old_end);
501 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
502 				old_addr, old_end);
503 	mmu_notifier_invalidate_range_start(&range);
504 
505 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
506 		cond_resched();
507 		/*
508 		 * If extent is PUD-sized try to speed up the move by moving at the
509 		 * PUD level if possible.
510 		 */
511 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
512 
513 		old_pud = get_old_pud(vma->vm_mm, old_addr);
		/* No source pud: skip the whole extent. */
514 		if (!old_pud)
515 			continue;
516 		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		/* Allocation failure ends the walk; the caller sees a short count. */
517 		if (!new_pud)
518 			break;
519 		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
520 			if (extent == HPAGE_PUD_SIZE) {
521 				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
522 					       old_pud, new_pud, need_rmap_locks);
523 				/* We ignore and continue on error? */
524 				continue;
525 			}
526 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
527 
528 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
529 					   old_pud, new_pud, true))
530 				continue;
531 		}
532 
		/* The PUD-level move was not possible: retry at PMD granularity. */
533 		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
534 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
535 		if (!old_pmd)
536 			continue;
537 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
538 		if (!new_pmd)
539 			break;
540 		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
541 		    pmd_devmap(*old_pmd)) {
542 			if (extent == HPAGE_PMD_SIZE &&
543 			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
544 					   old_pmd, new_pmd, need_rmap_locks))
545 				continue;
			/*
			 * The huge pmd was not moved as a unit: split it, and
			 * fall through to the pte move unless
			 * pmd_trans_unstable() reports the pmd as unstable.
			 */
546 			split_huge_pmd(vma, old_pmd, old_addr);
547 			if (pmd_trans_unstable(old_pmd))
548 				continue;
549 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
550 			   extent == PMD_SIZE) {
551 			/*
552 			 * If the extent is PMD-sized, try to speed the move by
553 			 * moving at the PMD level if possible.
554 			 */
555 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
556 					   old_pmd, new_pmd, true))
557 				continue;
558 		}
559 
		/* Slow path: make sure a destination pte table exists, then move ptes. */
560 		if (pte_alloc(new_vma->vm_mm, new_pmd))
561 			break;
562 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
563 			  new_pmd, new_addr, need_rmap_locks);
564 	}
565 
566 	mmu_notifier_invalidate_range_end(&range);
567 
	/* old_addr stopped at old_end on success, or earlier after a break. */
568 	return len + old_addr - old_end;	/* how much done */
569 }
570 
/*
 * move_vma - relocate [old_addr, old_addr + old_len) of @vma to @new_addr
 * with length @new_len.
 *
 * A destination vma is created with copy_vma(), the page tables are moved
 * with move_page_tables(), and, unless MREMAP_DONTUNMAP is set in @flags,
 * the source range is then unmapped with do_vmi_munmap().  If moving the
 * page tables or the vma's ->mremap() hook fails, the entries are moved back
 * and the destination is unmapped instead.
 *
 * @locked:   set to true when the vma is VM_LOCKED and the old range was
 *            unmapped (not set on the MREMAP_DONTUNMAP success path).
 * @uf:       passed to mremap_userfaultfd_prep() on success.
 * @uf_unmap: passed to do_vmi_munmap().
 *
 * Returns @new_addr on success, or a negative errno value carried in the
 * unsigned long return type.
 */
571 static unsigned long move_vma(struct vm_area_struct *vma,
572 		unsigned long old_addr, unsigned long old_len,
573 		unsigned long new_len, unsigned long new_addr,
574 		bool *locked, unsigned long flags,
575 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
576 {
	/* Bytes to charge: the growth, or all of new_len for DONTUNMAP (below). */
577 	long to_account = new_len - old_len;
578 	struct mm_struct *mm = vma->vm_mm;
579 	struct vm_area_struct *new_vma;
	/* Snapshot of the flags; ksm_madvise() below works on this copy. */
580 	unsigned long vm_flags = vma->vm_flags;
581 	unsigned long new_pgoff;
582 	unsigned long moved_len;
583 	unsigned long account_start = 0;
584 	unsigned long account_end = 0;
585 	unsigned long hiwater_vm;
586 	int err = 0;
587 	bool need_rmap_locks;
588 	struct vma_iterator vmi;
589 
590 	/*
591 	 * We'd prefer to avoid failure later on in do_munmap:
592 	 * which may split one vma into three before unmapping.
593 	 */
594 	if (mm->map_count >= sysctl_max_map_count - 3)
595 		return -ENOMEM;
596 
597 	if (unlikely(flags & MREMAP_DONTUNMAP))
598 		to_account = new_len;
599 
	/*
	 * Ask the vma's ->may_split() hook about each end of the range that
	 * does not coincide with the corresponding end of the vma.
	 */
600 	if (vma->vm_ops && vma->vm_ops->may_split) {
601 		if (vma->vm_start != old_addr)
602 			err = vma->vm_ops->may_split(vma, old_addr);
603 		if (!err && vma->vm_end != old_addr + old_len)
604 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
605 		if (err)
606 			return err;
607 	}
608 
609 	/*
610 	 * Advise KSM to break any KSM pages in the area to be moved:
611 	 * it would be confusing if they were to turn up at the new
612 	 * location, where they happen to coincide with different KSM
613 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
614 	 * so KSM can come around to merge on vma and new_vma afterwards.
615 	 */
616 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
617 						MADV_UNMERGEABLE, &vm_flags);
618 	if (err)
619 		return err;
620 
621 	if (vm_flags & VM_ACCOUNT) {
622 		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
623 			return -ENOMEM;
624 	}
625 
626 	vma_start_write(vma);
	/* File offset (in pages) that corresponds to old_addr. */
627 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
628 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
629 			   &need_rmap_locks);
630 	if (!new_vma) {
		/* Undo the charge taken above. */
631 		if (vm_flags & VM_ACCOUNT)
632 			vm_unacct_memory(to_account >> PAGE_SHIFT);
633 		return -ENOMEM;
634 	}
635 
636 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
637 				     need_rmap_locks);
638 	if (moved_len < old_len) {
639 		err = -ENOMEM;
640 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
641 		err = vma->vm_ops->mremap(new_vma);
642 	}
643 
644 	if (unlikely(err)) {
645 		/*
646 		 * On error, move entries back from new area to old,
647 		 * which will succeed since page tables still there,
648 		 * and then proceed to unmap new area instead of old.
649 		 */
650 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
651 				 true);
		/*
		 * From here on vma/old_addr/old_len describe the NEW area, so
		 * the common code below tears it down; new_addr now carries
		 * the errno that will be returned.
		 */
652 		vma = new_vma;
653 		old_len = new_len;
654 		old_addr = new_addr;
655 		new_addr = err;
656 	} else {
657 		mremap_userfaultfd_prep(new_vma, uf);
658 	}
659 
660 	if (is_vm_hugetlb_page(vma)) {
661 		clear_vma_resv_huge_pages(vma);
662 	}
663 
664 	/* Conceal VM_ACCOUNT so old reservation is not undone */
665 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
666 		vm_flags_clear(vma, VM_ACCOUNT);
		/*
		 * Remember whether part of the vma survives before and/or
		 * after the unmapped range, so VM_ACCOUNT can be restored on
		 * those pieces at the end.
		 */
667 		if (vma->vm_start < old_addr)
668 			account_start = vma->vm_start;
669 		if (vma->vm_end > old_addr + old_len)
670 			account_end = vma->vm_end;
671 	}
672 
673 	/*
674 	 * If we failed to move page tables we still do total_vm increment
675 	 * since do_munmap() will decrement it by old_len == new_len.
676 	 *
677 	 * Since total_vm is about to be raised artificially high for a
678 	 * moment, we need to restore high watermark afterwards: if stats
679 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
680 	 * If this were a serious issue, we'd add a flag to do_munmap().
681 	 */
682 	hiwater_vm = mm->hiwater_vm;
683 	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
684 
685 	/* Tell pfnmap has moved from this vma */
686 	if (unlikely(vma->vm_flags & VM_PFNMAP))
687 		untrack_pfn_clear(vma);
688 
689 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
690 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
691 		vm_flags_clear(vma, VM_LOCKED_MASK);
692 
693 		/*
694 		 * The anon_vma links of the old vma are no longer needed after
695 		 * its page table has been moved.
696 		 */
697 		if (new_vma != vma && vma->vm_start == old_addr &&
698 			vma->vm_end == (old_addr + old_len))
699 			unlink_anon_vmas(vma);
700 
701 		/* Because we won't unmap we don't need to touch locked_vm */
702 		return new_addr;
703 	}
704 
705 	vma_iter_init(&vmi, mm, old_addr);
706 	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
707 		/* OOM: unable to split vma, just get accounts right */
708 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
709 			vm_acct_memory(old_len >> PAGE_SHIFT);
710 		account_start = account_end = 0;
711 	}
712 
713 	if (vm_flags & VM_LOCKED) {
714 		mm->locked_vm += new_len >> PAGE_SHIFT;
715 		*locked = true;
716 	}
717 
	/* Undo the temporary high-watermark bump described above. */
718 	mm->hiwater_vm = hiwater_vm;
719 
720 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
721 	if (account_start) {
722 		vma = vma_prev(&vmi);
723 		vm_flags_set(vma, VM_ACCOUNT);
724 	}
725 
726 	if (account_end) {
727 		vma = vma_next(&vmi);
728 		vm_flags_set(vma, VM_ACCOUNT);
729 	}
730 
	/* Either the destination address or, after a failed move, the errno. */
731 	return new_addr;
732 }
733 
734 static struct vm_area_struct *vma_to_resize(unsigned long addr,
735 	unsigned long old_len, unsigned long new_len, unsigned long flags)
736 {
737 	struct mm_struct *mm = current->mm;
738 	struct vm_area_struct *vma;
739 	unsigned long pgoff;
740 
741 	vma = vma_lookup(mm, addr);
742 	if (!vma)
743 		return ERR_PTR(-EFAULT);
744 
745 	/*
746 	 * !old_len is a special case where an attempt is made to 'duplicate'
747 	 * a mapping.  This makes no sense for private mappings as it will
748 	 * instead create a fresh/new mapping unrelated to the original.  This
749 	 * is contrary to the basic idea of mremap which creates new mappings
750 	 * based on the original.  There are no known use cases for this
751 	 * behavior.  As a result, fail such attempts.
752 	 */
753 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
754 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
755 		return ERR_PTR(-EINVAL);
756 	}
757 
758 	if ((flags & MREMAP_DONTUNMAP) &&
759 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
760 		return ERR_PTR(-EINVAL);
761 
762 	/* We can't remap across vm area boundaries */
763 	if (old_len > vma->vm_end - addr)
764 		return ERR_PTR(-EFAULT);
765 
766 	if (new_len == old_len)
767 		return vma;
768 
769 	/* Need to be careful about a growing mapping */
770 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
771 	pgoff += vma->vm_pgoff;
772 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
773 		return ERR_PTR(-EINVAL);
774 
775 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
776 		return ERR_PTR(-EFAULT);
777 
778 	if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
779 		return ERR_PTR(-EAGAIN);
780 
781 	if (!may_expand_vm(mm, vma->vm_flags,
782 				(new_len - old_len) >> PAGE_SHIFT))
783 		return ERR_PTR(-ENOMEM);
784 
785 	return vma;
786 }
787 
/*
 * mremap_to - handle an mremap() request that names a destination or must
 * not unmap the source (the syscall takes this path for MREMAP_FIXED and
 * MREMAP_DONTUNMAP).
 *
 * Validates @new_addr/@new_len, rejects overlapping old and new ranges,
 * unmaps the destination range for MREMAP_FIXED, trims the source when it is
 * being shrunk, picks the final destination with get_unmapped_area() and
 * finally calls move_vma().
 *
 * @locked, @uf and @uf_unmap are passed through to move_vma();
 * @uf_unmap_early is used for the MREMAP_FIXED unmap of the destination.
 *
 * Returns the new address on success or a negative errno value carried in
 * the unsigned long return type.
 */
788 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
789 		unsigned long new_addr, unsigned long new_len, bool *locked,
790 		unsigned long flags, struct vm_userfaultfd_ctx *uf,
791 		struct list_head *uf_unmap_early,
792 		struct list_head *uf_unmap)
793 {
794 	struct mm_struct *mm = current->mm;
795 	struct vm_area_struct *vma;
796 	unsigned long ret = -EINVAL;
797 	unsigned long map_flags = 0;
798 
799 	if (offset_in_page(new_addr))
800 		goto out;
801 
	/* The destination range must fit below TASK_SIZE. */
802 	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
803 		goto out;
804 
805 	/* Ensure the old/new locations do not overlap */
806 	if (addr + old_len > new_addr && new_addr + new_len > addr)
807 		goto out;
808 
809 	/*
810 	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
811 	 * it will bail out at the very beginning.
812 	 * That is a problem if we have already unmapped the regions here
813 	 * (new_addr, and old_addr), because userspace will not know the
814 	 * state of the vma's after it gets -ENOMEM.
815 	 * So, to avoid such scenario we can pre-compute if the whole
816 	 * operation has high chances to success map-wise.
817 	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
818 	 * split in 3 before unmapping it.
819 	 * That means 2 more maps (1 for each) to the ones we already hold.
820 	 * Check whether current map count plus 2 still leads us to 4 maps below
821 	 * the threshold, otherwise return -ENOMEM here to be more safe.
822 	 */
823 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
824 		return -ENOMEM;
825 
	/* MREMAP_FIXED: clear out whatever is mapped at the destination. */
826 	if (flags & MREMAP_FIXED) {
827 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
828 		if (ret)
829 			goto out;
830 	}
831 
	/* Shrinking: drop the tail of the source first, then move what is left. */
832 	if (old_len > new_len) {
833 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
834 		if (ret)
835 			goto out;
836 		old_len = new_len;
837 	}
838 
839 	vma = vma_to_resize(addr, old_len, new_len, flags);
840 	if (IS_ERR(vma)) {
841 		ret = PTR_ERR(vma);
842 		goto out;
843 	}
844 
845 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
846 	if (flags & MREMAP_DONTUNMAP &&
847 		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
848 		ret = -ENOMEM;
849 		goto out;
850 	}
851 
852 	if (flags & MREMAP_FIXED)
853 		map_flags |= MAP_FIXED;
854 
855 	if (vma->vm_flags & VM_MAYSHARE)
856 		map_flags |= MAP_SHARED;
857 
858 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
859 				((addr - vma->vm_start) >> PAGE_SHIFT),
860 				map_flags);
861 	if (IS_ERR_VALUE(ret))
862 		goto out;
863 
864 	/* We got a new mapping */
865 	if (!(flags & MREMAP_FIXED))
866 		new_addr = ret;
867 
868 	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
869 		       uf_unmap);
870 
871 out:
872 	return ret;
873 }
874 
875 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
876 {
877 	unsigned long end = vma->vm_end + delta;
878 
879 	if (end < vma->vm_end) /* overflow */
880 		return 0;
881 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
882 		return 0;
883 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
884 			      0, MAP_FIXED) & ~PAGE_MASK)
885 		return 0;
886 	return 1;
887 }
888 
889 /*
890  * Expand (or shrink) an existing mapping, potentially moving it at the
891  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
892  *
893  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
894  * This option requires MREMAP_MAYMOVE to be set as well; the call fails
 * with -EINVAL otherwise.  The same holds for MREMAP_DONTUNMAP, which in
 * addition requires old_len == new_len.
 *
 * Returns the (possibly new) address of the mapping on success, or a
 * negative errno value carried in the unsigned long return type.
895  */
896 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
897 		unsigned long, new_len, unsigned long, flags,
898 		unsigned long, new_addr)
899 {
900 	struct mm_struct *mm = current->mm;
901 	struct vm_area_struct *vma;
902 	unsigned long ret = -EINVAL;
903 	bool locked = false;
904 	bool downgraded = false;
905 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
906 	LIST_HEAD(uf_unmap_early);
907 	LIST_HEAD(uf_unmap);
908 
909 	/*
910 	 * There is a deliberate asymmetry here: we strip the pointer tag
911 	 * from the old address but leave the new address alone. This is
912 	 * for consistency with mmap(), where we prevent the creation of
913 	 * aliasing mappings in userspace by leaving the tag bits of the
914 	 * mapping address intact. A non-zero tag will cause the subsequent
915 	 * range checks to reject the address as invalid.
916 	 *
917 	 * See Documentation/arm64/tagged-address-abi.rst for more information.
918 	 */
919 	addr = untagged_addr(addr);
920 
	/* Unknown flag bits are rejected (ret is still -EINVAL here). */
921 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
922 		return ret;
923 
924 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
925 		return ret;
926 
927 	/*
928 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
929 	 * in the process.
930 	 */
931 	if (flags & MREMAP_DONTUNMAP &&
932 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
933 		return ret;
934 
935 
936 	if (offset_in_page(addr))
937 		return ret;
938 
939 	old_len = PAGE_ALIGN(old_len);
940 	new_len = PAGE_ALIGN(new_len);
941 
942 	/*
943 	 * We allow a zero old-len as a special case
944 	 * for DOS-emu "duplicate shm area" thing. But
945 	 * a zero new-len is nonsensical.
946 	 */
947 	if (!new_len)
948 		return ret;
949 
950 	if (mmap_write_lock_killable(current->mm))
951 		return -EINTR;
952 	vma = vma_lookup(mm, addr);
953 	if (!vma) {
954 		ret = -EFAULT;
955 		goto out;
956 	}
957 
958 	if (is_vm_hugetlb_page(vma)) {
959 		struct hstate *h __maybe_unused = hstate_vma(vma);
960 
961 		old_len = ALIGN(old_len, huge_page_size(h));
962 		new_len = ALIGN(new_len, huge_page_size(h));
963 
964 		/* addrs must be huge page aligned */
965 		if (addr & ~huge_page_mask(h))
966 			goto out;
967 		if (new_addr & ~huge_page_mask(h))
968 			goto out;
969 
970 		/*
971 		 * Don't allow remap expansion, because the underlying hugetlb
972 		 * reservation is not yet capable to handle split reservation.
973 		 */
974 		if (new_len > old_len)
975 			goto out;
976 	}
977 
	/* Explicit destination or keep-source requests are handled by mremap_to(). */
978 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
979 		ret = mremap_to(addr, old_len, new_addr, new_len,
980 				&locked, flags, &uf, &uf_unmap_early,
981 				&uf_unmap);
982 		goto out;
983 	}
984 
985 	/*
986 	 * Always allow a shrinking remap: that just unmaps
987 	 * the unnecessary pages..
988 	 * do_vmi_munmap does all the needed commit accounting, and
989 	 * downgrades mmap_lock to read if so directed.
990 	 */
991 	if (old_len >= new_len) {
992 		int retval;
993 		VMA_ITERATOR(vmi, mm, addr + new_len);
994 
995 		retval = do_vmi_munmap(&vmi, mm, addr + new_len,
996 				       old_len - new_len, &uf_unmap, true);
997 		/* Returning 1 indicates mmap_lock is downgraded to read. */
998 		if (retval == 1) {
999 			downgraded = true;
		/* A failure is ignored when old_len == new_len (nothing to unmap). */
1000 		} else if (retval < 0 && old_len != new_len) {
1001 			ret = retval;
1002 			goto out;
1003 		}
1004 
1005 		ret = addr;
1006 		goto out;
1007 	}
1008 
1009 	/*
1010 	 * Ok, we need to grow..
1011 	 */
1012 	vma = vma_to_resize(addr, old_len, new_len, flags);
1013 	if (IS_ERR(vma)) {
1014 		ret = PTR_ERR(vma);
1015 		goto out;
1016 	}
1017 
1018 	/* old_len exactly to the end of the area..
1019 	 */
1020 	if (old_len == vma->vm_end - addr) {
1021 		/* can we just expand the current mapping? */
1022 		if (vma_expandable(vma, new_len - old_len)) {
1023 			long pages = (new_len - old_len) >> PAGE_SHIFT;
1024 			unsigned long extension_start = addr + old_len;
1025 			unsigned long extension_end = addr + new_len;
1026 			pgoff_t extension_pgoff = vma->vm_pgoff +
1027 				((extension_start - vma->vm_start) >> PAGE_SHIFT);
1028 			VMA_ITERATOR(vmi, mm, extension_start);
1029 
1030 			if (vma->vm_flags & VM_ACCOUNT) {
1031 				if (security_vm_enough_memory_mm(mm, pages)) {
1032 					ret = -ENOMEM;
1033 					goto out;
1034 				}
1035 			}
1036 
1037 			/*
1038 			 * Function vma_merge() is called on the extension we
1039 			 * are adding to the already existing vma, vma_merge()
1040 			 * will merge this extension with the already existing
1041 			 * vma (expand operation itself) and possibly also with
1042 			 * the next vma if it becomes adjacent to the expanded
1043 			 * vma and  otherwise compatible.
1044 			 */
1045 			vma = vma_merge(&vmi, mm, vma, extension_start,
1046 				extension_end, vma->vm_flags, vma->anon_vma,
1047 				vma->vm_file, extension_pgoff, vma_policy(vma),
1048 				vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1049 			if (!vma) {
1050 				vm_unacct_memory(pages);
1051 				ret = -ENOMEM;
1052 				goto out;
1053 			}
1054 
1055 			vm_stat_account(mm, vma->vm_flags, pages);
1056 			if (vma->vm_flags & VM_LOCKED) {
1057 				mm->locked_vm += pages;
1058 				locked = true;
				/* mm_populate() at "out" uses new_addr as the base. */
1059 				new_addr = addr;
1060 			}
1061 			ret = addr;
1062 			goto out;
1063 		}
1064 	}
1065 
1066 	/*
1067 	 * We weren't able to just expand or shrink the area,
1068 	 * we need to create a new one and move it..
1069 	 */
1070 	ret = -ENOMEM;
1071 	if (flags & MREMAP_MAYMOVE) {
1072 		unsigned long map_flags = 0;
1073 		if (vma->vm_flags & VM_MAYSHARE)
1074 			map_flags |= MAP_SHARED;
1075 
1076 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1077 					vma->vm_pgoff +
1078 					((addr - vma->vm_start) >> PAGE_SHIFT),
1079 					map_flags);
1080 		if (IS_ERR_VALUE(new_addr)) {
1081 			ret = new_addr;
1082 			goto out;
1083 		}
1084 
1085 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1086 			       &locked, flags, &uf, &uf_unmap);
1087 	}
1088 out:
	/*
	 * ret is either a page-aligned address or a negative errno, so a
	 * non-zero page offset identifies the error case: skip populating.
	 */
1089 	if (offset_in_page(ret))
1090 		locked = false;
	/* Release mmap_lock in whichever mode it is currently held. */
1091 	if (downgraded)
1092 		mmap_read_unlock(current->mm);
1093 	else
1094 		mmap_write_unlock(current->mm);
	/* For a locked mapping that grew, fault in the newly added part. */
1095 	if (locked && new_len > old_len)
1096 		mm_populate(new_addr + old_len, new_len - old_len);
1097 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1098 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1099 	userfaultfd_unmap_complete(mm, &uf_unmap);
1100 	return ret;
1101 }
1102