xref: /linux/mm/vma.c (revision b7012d513f81959596a01083415597edb04cf509)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 /*
4  * VMA-specific functions.
5  */
6 
7 #include "vma_internal.h"
8 #include "vma.h"
9 
10 /*
11  * If the vma has a ->close operation then the driver probably needs to release
12  * per-vma resources, so we don't attempt to merge those if the caller indicates
13  * the current vma may be removed as part of the merge.
14  */
15 static inline bool is_mergeable_vma(struct vm_area_struct *vma,
16 		struct file *file, unsigned long vm_flags,
17 		struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
18 		struct anon_vma_name *anon_name, bool may_remove_vma)
19 {
20 	/*
21 	 * VM_SOFTDIRTY should not prevent VMA merging: if the flags match
22 	 * apart from the dirty bit, the caller should mark the merged VMA
23 	 * as dirty. If the dirty bit were not excluded from the comparison,
24 	 * we would increase pressure on the memory system by forcing the
25 	 * kernel to generate new VMAs when old ones could be extended
26 	 * instead.
27 	 */
28 	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
29 		return false;
30 	if (vma->vm_file != file)
31 		return false;
32 	if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
33 		return false;
34 	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
35 		return false;
36 	if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
37 		return false;
38 	return true;
39 }
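/*
 * Editor's illustrative sketch (not part of the original file): the flag
 * comparison above treats VM_SOFTDIRTY as "don't care", so two flag words
 * that differ only in that bit still count as mergeable. The helper name
 * example_flags_mergeable() is hypothetical.
 */
static inline bool __maybe_unused example_flags_mergeable(unsigned long flags1,
							   unsigned long flags2)
{
	/* Mirrors the first check in is_mergeable_vma(). */
	return !((flags1 ^ flags2) & ~VM_SOFTDIRTY);
}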
40 
41 static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
42 		 struct anon_vma *anon_vma2, struct vm_area_struct *vma)
43 {
44 	/*
45 	 * The list_is_singular() test is to avoid merging VMAs cloned from
46 	 * parents. This improves scalability otherwise limited by the anon_vma lock.
47 	 */
48 	if ((!anon_vma1 || !anon_vma2) && (!vma ||
49 		list_is_singular(&vma->anon_vma_chain)))
50 		return true;
51 	return anon_vma1 == anon_vma2;
52 }
53 
54 /*
55  * init_multi_vma_prep() - Initializer for struct vma_prepare
56  * @vp: The vma_prepare struct
57  * @vma: The vma that will be altered once locked
58  * @next: The next vma if it is to be adjusted
59  * @remove: The first vma to be removed
60  * @remove2: The second vma to be removed
61  */
62 static void init_multi_vma_prep(struct vma_prepare *vp,
63 				struct vm_area_struct *vma,
64 				struct vm_area_struct *next,
65 				struct vm_area_struct *remove,
66 				struct vm_area_struct *remove2)
67 {
68 	memset(vp, 0, sizeof(struct vma_prepare));
69 	vp->vma = vma;
70 	vp->anon_vma = vma->anon_vma;
71 	vp->remove = remove;
72 	vp->remove2 = remove2;
73 	vp->adj_next = next;
74 	if (!vp->anon_vma && next)
75 		vp->anon_vma = next->anon_vma;
76 
77 	vp->file = vma->vm_file;
78 	if (vp->file)
79 		vp->mapping = vma->vm_file->f_mapping;
80 
81 }
82 
83 /*
84  * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
85  * in front of (at a lower virtual address and file offset than) the vma.
86  *
87  * We cannot merge two vmas if they have differently assigned (non-NULL)
88  * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
89  *
90  * We don't check here for the merged mmap wrapping around the end of pagecache
91  * indices (16TB on ia32) because do_mmap() does not permit mmap's which
92  * wrap, nor mmaps which cover the final page at index -1UL.
93  *
94  * We assume the vma may be removed as part of the merge.
95  */
96 bool
97 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
98 		struct anon_vma *anon_vma, struct file *file,
99 		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
100 		struct anon_vma_name *anon_name)
101 {
102 	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
103 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
104 		if (vma->vm_pgoff == vm_pgoff)
105 			return true;
106 	}
107 	return false;
108 }
109 
110 /*
111  * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
112  * beyond (at a higher virtual address and file offset than) the vma.
113  *
114  * We cannot merge two vmas if they have differently assigned (non-NULL)
115  * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
116  *
117  * We assume that vma is not removed as part of the merge.
118  */
119 bool
120 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
121 		struct anon_vma *anon_vma, struct file *file,
122 		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
123 		struct anon_vma_name *anon_name)
124 {
125 	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
126 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
127 		pgoff_t vm_pglen;
128 
129 		vm_pglen = vma_pages(vma);
130 		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
131 			return true;
132 	}
133 	return false;
134 }
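/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * that wants to append a new file-backed range starting exactly at
 * prev->vm_end can combine the adjacency check with can_vma_merge_after().
 * For the merge to be allowed, @pgoff must continue prev's file offset,
 * i.e. prev->vm_pgoff + vma_pages(prev) == pgoff. The helper name and its
 * parameter set are hypothetical.
 */
static bool __maybe_unused example_can_append_after(struct vm_area_struct *prev,
		unsigned long addr, unsigned long vm_flags,
		struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff,
		struct vm_userfaultfd_ctx uffd_ctx,
		struct anon_vma_name *anon_name)
{
	/* Not virtually adjacent: nothing to merge with. */
	if (!prev || prev->vm_end != addr)
		return false;

	return can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff,
				   uffd_ctx, anon_name);
}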
135 
136 /*
137  * Close a vm structure and free it.
138  */
139 void remove_vma(struct vm_area_struct *vma, bool unreachable)
140 {
141 	might_sleep();
142 	if (vma->vm_ops && vma->vm_ops->close)
143 		vma->vm_ops->close(vma);
144 	if (vma->vm_file)
145 		fput(vma->vm_file);
146 	mpol_put(vma_policy(vma));
147 	if (unreachable)
148 		__vm_area_free(vma);
149 	else
150 		vm_area_free(vma);
151 }
152 
153 /*
154  * Get rid of page table information in the indicated region.
155  *
156  * Called with the mm semaphore held.
157  */
158 void unmap_region(struct mm_struct *mm, struct ma_state *mas,
159 		struct vm_area_struct *vma, struct vm_area_struct *prev,
160 		struct vm_area_struct *next, unsigned long start,
161 		unsigned long end, unsigned long tree_end, bool mm_wr_locked)
162 {
163 	struct mmu_gather tlb;
164 	unsigned long mt_start = mas->index;
165 
166 	lru_add_drain();
167 	tlb_gather_mmu(&tlb, mm);
168 	update_hiwater_rss(mm);
169 	unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
170 	mas_set(mas, mt_start);
171 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
172 				 next ? next->vm_start : USER_PGTABLES_CEILING,
173 				 mm_wr_locked);
174 	tlb_finish_mmu(&tlb);
175 }
176 
177 /*
178  * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
179  * has already been checked or doesn't make sense to fail.
180  * VMA Iterator will point to the original VMA.
181  */
182 static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
183 		       unsigned long addr, int new_below)
184 {
185 	struct vma_prepare vp;
186 	struct vm_area_struct *new;
187 	int err;
188 
189 	WARN_ON(vma->vm_start >= addr);
190 	WARN_ON(vma->vm_end <= addr);
191 
192 	if (vma->vm_ops && vma->vm_ops->may_split) {
193 		err = vma->vm_ops->may_split(vma, addr);
194 		if (err)
195 			return err;
196 	}
197 
198 	new = vm_area_dup(vma);
199 	if (!new)
200 		return -ENOMEM;
201 
202 	if (new_below) {
203 		new->vm_end = addr;
204 	} else {
205 		new->vm_start = addr;
206 		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
207 	}
208 
209 	err = -ENOMEM;
210 	vma_iter_config(vmi, new->vm_start, new->vm_end);
211 	if (vma_iter_prealloc(vmi, new))
212 		goto out_free_vma;
213 
214 	err = vma_dup_policy(vma, new);
215 	if (err)
216 		goto out_free_vmi;
217 
218 	err = anon_vma_clone(new, vma);
219 	if (err)
220 		goto out_free_mpol;
221 
222 	if (new->vm_file)
223 		get_file(new->vm_file);
224 
225 	if (new->vm_ops && new->vm_ops->open)
226 		new->vm_ops->open(new);
227 
228 	vma_start_write(vma);
229 	vma_start_write(new);
230 
231 	init_vma_prep(&vp, vma);
232 	vp.insert = new;
233 	vma_prepare(&vp);
234 	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
235 
236 	if (new_below) {
237 		vma->vm_start = addr;
238 		vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
239 	} else {
240 		vma->vm_end = addr;
241 	}
242 
243 	/* vma_complete stores the new vma */
244 	vma_complete(&vp, vmi, vma->vm_mm);
245 
246 	/* Success. */
247 	if (new_below)
248 		vma_next(vmi);
249 	else
250 		vma_prev(vmi);
251 
252 	return 0;
253 
254 out_free_mpol:
255 	mpol_put(vma_policy(new));
256 out_free_vmi:
257 	vma_iter_free(vmi);
258 out_free_vma:
259 	vm_area_free(new);
260 	return err;
261 }
262 
263 /*
264  * Split a vma into two pieces at address 'addr'; a new vma is allocated
265  * for either the first part or the tail.
266  */
267 static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
268 		     unsigned long addr, int new_below)
269 {
270 	if (vma->vm_mm->map_count >= sysctl_max_map_count)
271 		return -ENOMEM;
272 
273 	return __split_vma(vmi, vma, addr, new_below);
274 }
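/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * holding the mmap_lock for writing and owning @vmi might ensure that @addr
 * becomes a VMA boundary like this. The helper name is hypothetical.
 */
static int __maybe_unused example_split_at(struct vma_iterator *vmi,
		struct vm_area_struct *vma, unsigned long addr)
{
	/* Already a boundary: nothing to do. */
	if (addr == vma->vm_start || addr == vma->vm_end)
		return 0;

	/* new_below == 0: the newly allocated VMA covers [addr, vm_end). */
	return split_vma(vmi, vma, addr, 0);
}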
275 
276 /*
277  * OK - we have the memory areas we should free on a maple tree, so release
278  * them and do the vma updates.
279  *
280  * Called with the mm semaphore held.
281  */
282 static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
283 {
284 	unsigned long nr_accounted = 0;
285 	struct vm_area_struct *vma;
286 
287 	/* Update high watermark before we lower total_vm */
288 	update_hiwater_vm(mm);
289 	mas_for_each(mas, vma, ULONG_MAX) {
290 		long nrpages = vma_pages(vma);
291 
292 		if (vma->vm_flags & VM_ACCOUNT)
293 			nr_accounted += nrpages;
294 		vm_stat_account(mm, vma->vm_flags, -nrpages);
295 		remove_vma(vma, false);
296 	}
297 	vm_unacct_memory(nr_accounted);
298 }
299 
300 /*
301  * init_vma_prep() - Initializer wrapper for vma_prepare struct
302  * @vp: The vma_prepare struct
303  * @vma: The vma that will be altered once locked
304  */
305 void init_vma_prep(struct vma_prepare *vp,
306 		   struct vm_area_struct *vma)
307 {
308 	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
309 }
310 
311 /*
312  * Requires inode->i_mapping->i_mmap_rwsem
313  */
314 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
315 				      struct address_space *mapping)
316 {
317 	if (vma_is_shared_maywrite(vma))
318 		mapping_unmap_writable(mapping);
319 
320 	flush_dcache_mmap_lock(mapping);
321 	vma_interval_tree_remove(vma, &mapping->i_mmap);
322 	flush_dcache_mmap_unlock(mapping);
323 }
324 
325 /*
326  * vma has some anon_vma assigned, and is already inserted on that
327  * anon_vma's interval trees.
328  *
329  * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
330  * vma must be removed from the anon_vma's interval trees using
331  * anon_vma_interval_tree_pre_update_vma().
332  *
333  * After the update, the vma will be reinserted using
334  * anon_vma_interval_tree_post_update_vma().
335  *
336  * The entire update must be protected by exclusive mmap_lock and by
337  * the root anon_vma's mutex.
338  */
339 void
340 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
341 {
342 	struct anon_vma_chain *avc;
343 
344 	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
345 		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
346 }
347 
348 void
349 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
350 {
351 	struct anon_vma_chain *avc;
352 
353 	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
354 		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
355 }
356 
357 static void __vma_link_file(struct vm_area_struct *vma,
358 			    struct address_space *mapping)
359 {
360 	if (vma_is_shared_maywrite(vma))
361 		mapping_allow_writable(mapping);
362 
363 	flush_dcache_mmap_lock(mapping);
364 	vma_interval_tree_insert(vma, &mapping->i_mmap);
365 	flush_dcache_mmap_unlock(mapping);
366 }
367 
368 /*
369  * vma_prepare() - Helper function for locking VMAs prior to altering them
370  * @vp: The initialized vma_prepare struct
371  */
372 void vma_prepare(struct vma_prepare *vp)
373 {
374 	if (vp->file) {
375 		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
376 
377 		if (vp->adj_next)
378 			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
379 				      vp->adj_next->vm_end);
380 
381 		i_mmap_lock_write(vp->mapping);
382 		if (vp->insert && vp->insert->vm_file) {
383 			/*
384 			 * Put into interval tree now, so instantiated pages
385 			 * are visible to arm/parisc __flush_dcache_page
386 			 * throughout; but we cannot insert into address
387 			 * space until vma start or end is updated.
388 			 */
389 			__vma_link_file(vp->insert,
390 					vp->insert->vm_file->f_mapping);
391 		}
392 	}
393 
394 	if (vp->anon_vma) {
395 		anon_vma_lock_write(vp->anon_vma);
396 		anon_vma_interval_tree_pre_update_vma(vp->vma);
397 		if (vp->adj_next)
398 			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
399 	}
400 
401 	if (vp->file) {
402 		flush_dcache_mmap_lock(vp->mapping);
403 		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
404 		if (vp->adj_next)
405 			vma_interval_tree_remove(vp->adj_next,
406 						 &vp->mapping->i_mmap);
407 	}
408 
409 }
410 
411 /*
412  * dup_anon_vma() - Helper function to duplicate anon_vma
413  * @dst: The destination VMA
414  * @src: The source VMA
415  * @dup: Set to the destination VMA when an anon_vma has been duplicated.
416  *
417  * Returns: 0 on success.
418  */
419 static int dup_anon_vma(struct vm_area_struct *dst,
420 			struct vm_area_struct *src, struct vm_area_struct **dup)
421 {
422 	/*
423 	 * Easily overlooked: when mprotect shifts the boundary, make sure the
424 	 * expanding vma has anon_vma set if the shrinking vma had, to cover any
425 	 * anon pages imported.
426 	 */
427 	if (src->anon_vma && !dst->anon_vma) {
428 		int ret;
429 
430 		vma_assert_write_locked(dst);
431 		dst->anon_vma = src->anon_vma;
432 		ret = anon_vma_clone(dst, src);
433 		if (ret)
434 			return ret;
435 
436 		*dup = dst;
437 	}
438 
439 	return 0;
440 }
441 
442 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
443 void validate_mm(struct mm_struct *mm)
444 {
445 	int bug = 0;
446 	int i = 0;
447 	struct vm_area_struct *vma;
448 	VMA_ITERATOR(vmi, mm, 0);
449 
450 	mt_validate(&mm->mm_mt);
451 	for_each_vma(vmi, vma) {
452 #ifdef CONFIG_DEBUG_VM_RB
453 		struct anon_vma *anon_vma = vma->anon_vma;
454 		struct anon_vma_chain *avc;
455 #endif
456 		unsigned long vmi_start, vmi_end;
457 		bool warn = 0;
458 
459 		vmi_start = vma_iter_addr(&vmi);
460 		vmi_end = vma_iter_end(&vmi);
461 		if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
462 			warn = 1;
463 
464 		if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
465 			warn = 1;
466 
467 		if (warn) {
468 			pr_emerg("issue in %s\n", current->comm);
469 			dump_stack();
470 			dump_vma(vma);
471 			pr_emerg("tree range: %px start %lx end %lx\n", vma,
472 				 vmi_start, vmi_end - 1);
473 			vma_iter_dump_tree(&vmi);
474 		}
475 
476 #ifdef CONFIG_DEBUG_VM_RB
477 		if (anon_vma) {
478 			anon_vma_lock_read(anon_vma);
479 			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
480 				anon_vma_interval_tree_verify(avc);
481 			anon_vma_unlock_read(anon_vma);
482 		}
483 #endif
484 		i++;
485 	}
486 	if (i != mm->map_count) {
487 		pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
488 		bug = 1;
489 	}
490 	VM_BUG_ON_MM(bug, mm);
491 }
492 #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
493 
494 /*
495  * vma_expand - Expand an existing VMA
496  *
497  * @vmi: The vma iterator
498  * @vma: The vma to expand
499  * @start: The start of the vma
500  * @end: The exclusive end of the vma
501  * @pgoff: The page offset of vma
502  * @next: The vma following @vma, if any.
503  *
504  * Expand @vma to @start and @end.  Can expand off the start and end.  Will
505  * expand over @next if it's different from @vma and @end == @next->vm_end.
506  * Checking if the @vma can expand and merge with @next needs to be handled by
507  * the caller.
508  *
509  * Returns: 0 on success
510  */
511 int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
512 	       unsigned long start, unsigned long end, pgoff_t pgoff,
513 	       struct vm_area_struct *next)
514 {
515 	struct vm_area_struct *anon_dup = NULL;
516 	bool remove_next = false;
517 	struct vma_prepare vp;
518 
519 	vma_start_write(vma);
520 	if (next && (vma != next) && (end == next->vm_end)) {
521 		int ret;
522 
523 		remove_next = true;
524 		vma_start_write(next);
525 		ret = dup_anon_vma(vma, next, &anon_dup);
526 		if (ret)
527 			return ret;
528 	}
529 
530 	init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
531 	/* Not merging but overwriting any part of next is not handled. */
532 	VM_WARN_ON(next && !vp.remove &&
533 		  next != vma && end > next->vm_start);
534 	/* Only handles expanding */
535 	VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
536 
537 	/* Note: vma iterator must be pointing to 'start' */
538 	vma_iter_config(vmi, start, end);
539 	if (vma_iter_prealloc(vmi, vma))
540 		goto nomem;
541 
542 	vma_prepare(&vp);
543 	vma_adjust_trans_huge(vma, start, end, 0);
544 	vma_set_range(vma, start, end, pgoff);
545 	vma_iter_store(vmi, vma);
546 
547 	vma_complete(&vp, vmi, vma->vm_mm);
548 	return 0;
549 
550 nomem:
551 	if (anon_dup)
552 		unlink_anon_vmas(anon_dup);
553 	return -ENOMEM;
554 }
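/*
 * Editor's illustrative sketch (not part of the original file): growing
 * @vma upwards by one page into a hole, assuming the caller has already
 * verified that the page after vm_end is unmapped and that @vmi points at
 * the VMA's range, as the comment above requires. Hypothetical helper name.
 */
static int __maybe_unused example_grow_one_page(struct vma_iterator *vmi,
						struct vm_area_struct *vma)
{
	/* The start does not move, so the page offset is unchanged. */
	return vma_expand(vmi, vma, vma->vm_start, vma->vm_end + PAGE_SIZE,
			  vma->vm_pgoff, NULL);
}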
555 
556 /*
557  * vma_shrink() - Reduce an existing VMA's memory area
558  * @vmi: The vma iterator
559  * @vma: The VMA to modify
560  * @start: The new start
561  * @end: The new end
562  *
563  * Returns: 0 on success, -ENOMEM otherwise
564  */
565 int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
566 	       unsigned long start, unsigned long end, pgoff_t pgoff)
567 {
568 	struct vma_prepare vp;
569 
570 	WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
571 
572 	if (vma->vm_start < start)
573 		vma_iter_config(vmi, vma->vm_start, start);
574 	else
575 		vma_iter_config(vmi, end, vma->vm_end);
576 
577 	if (vma_iter_prealloc(vmi, NULL))
578 		return -ENOMEM;
579 
580 	vma_start_write(vma);
581 
582 	init_vma_prep(&vp, vma);
583 	vma_prepare(&vp);
584 	vma_adjust_trans_huge(vma, start, end, 0);
585 
586 	vma_iter_clear(vmi);
587 	vma_set_range(vma, start, end, pgoff);
588 	vma_complete(&vp, vmi, vma->vm_mm);
589 	return 0;
590 }
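/*
 * Editor's illustrative sketch (not part of the original file): trimming a
 * VMA from the top so that it ends at @new_end. Because the start does not
 * move, the page offset stays the same. Hypothetical helper name.
 */
static int __maybe_unused example_trim_top(struct vma_iterator *vmi,
		struct vm_area_struct *vma, unsigned long new_end)
{
	return vma_shrink(vmi, vma, vma->vm_start, new_end, vma->vm_pgoff);
}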
591 
592 /*
593  * vma_complete() - Helper function for handling the unlocking after altering VMAs,
594  * or for inserting a VMA.
595  *
596  * @vp: The vma_prepare struct
597  * @vmi: The vma iterator
598  * @mm: The mm_struct
599  */
600 void vma_complete(struct vma_prepare *vp,
601 		  struct vma_iterator *vmi, struct mm_struct *mm)
602 {
603 	if (vp->file) {
604 		if (vp->adj_next)
605 			vma_interval_tree_insert(vp->adj_next,
606 						 &vp->mapping->i_mmap);
607 		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
608 		flush_dcache_mmap_unlock(vp->mapping);
609 	}
610 
611 	if (vp->remove && vp->file) {
612 		__remove_shared_vm_struct(vp->remove, vp->mapping);
613 		if (vp->remove2)
614 			__remove_shared_vm_struct(vp->remove2, vp->mapping);
615 	} else if (vp->insert) {
616 		/*
617 		 * split_vma has split insert from vma, and needs
618 		 * us to insert it before dropping the locks
619 		 * (it may either follow vma or precede it).
620 		 */
621 		vma_iter_store(vmi, vp->insert);
622 		mm->map_count++;
623 	}
624 
625 	if (vp->anon_vma) {
626 		anon_vma_interval_tree_post_update_vma(vp->vma);
627 		if (vp->adj_next)
628 			anon_vma_interval_tree_post_update_vma(vp->adj_next);
629 		anon_vma_unlock_write(vp->anon_vma);
630 	}
631 
632 	if (vp->file) {
633 		i_mmap_unlock_write(vp->mapping);
634 		uprobe_mmap(vp->vma);
635 
636 		if (vp->adj_next)
637 			uprobe_mmap(vp->adj_next);
638 	}
639 
640 	if (vp->remove) {
641 again:
642 		vma_mark_detached(vp->remove, true);
643 		if (vp->file) {
644 			uprobe_munmap(vp->remove, vp->remove->vm_start,
645 				      vp->remove->vm_end);
646 			fput(vp->file);
647 		}
648 		if (vp->remove->anon_vma)
649 			anon_vma_merge(vp->vma, vp->remove);
650 		mm->map_count--;
651 		mpol_put(vma_policy(vp->remove));
652 		if (!vp->remove2)
653 			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
654 		vm_area_free(vp->remove);
655 
656 		/*
657 		 * In mprotect's case 6 (see comments on vma_merge),
658 		 * we are removing both mid and next vmas
659 		 */
660 		if (vp->remove2) {
661 			vp->remove = vp->remove2;
662 			vp->remove2 = NULL;
663 			goto again;
664 		}
665 	}
666 	if (vp->insert && vp->file)
667 		uprobe_mmap(vp->insert);
668 	validate_mm(mm);
669 }
670 
671 /*
672  * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
673  * @vmi: The vma iterator
674  * @vma: The starting vm_area_struct
675  * @mm: The mm_struct
676  * @start: The aligned start address to munmap.
677  * @end: The aligned end address to munmap.
678  * @uf: The userfaultfd list_head
679  * @unlock: Set to true to drop the mmap_lock.  Unlocking only happens on
680  * success.
681  *
682  * Return: 0 on success and drops the lock if so directed, error and leaves the
683  * lock held otherwise.
684  */
685 int
686 do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
687 		    struct mm_struct *mm, unsigned long start,
688 		    unsigned long end, struct list_head *uf, bool unlock)
689 {
690 	struct vm_area_struct *prev, *next = NULL;
691 	struct maple_tree mt_detach;
692 	int count = 0;
693 	int error = -ENOMEM;
694 	unsigned long locked_vm = 0;
695 	MA_STATE(mas_detach, &mt_detach, 0, 0);
696 	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
697 	mt_on_stack(mt_detach);
698 
699 	/*
700 	 * If we need to split any vma, do it now to save pain later.
701 	 *
702 	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
703 	 * unmapped vm_area_struct will remain in use: so lower split_vma
704 	 * places tmp vma above, and higher split_vma places tmp vma below.
705 	 */
706 
707 	/* Does it split the first one? */
708 	if (start > vma->vm_start) {
709 
710 		/*
711 		 * Make sure that map_count on return from munmap() will
712 		 * not exceed its limit; but let map_count go just above
713 		 * its limit temporarily, to help free resources as expected.
714 		 */
715 		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
716 			goto map_count_exceeded;
717 
718 		/* Don't bother splitting the VMA if we can't unmap it anyway */
719 		if (!can_modify_vma(vma)) {
720 			error = -EPERM;
721 			goto start_split_failed;
722 		}
723 
724 		error = __split_vma(vmi, vma, start, 1);
725 		if (error)
726 			goto start_split_failed;
727 	}
728 
729 	/*
730 	 * Detach a range of VMAs from the mm. Using next as a temp variable as
731 	 * it is always overwritten.
732 	 */
733 	next = vma;
734 	do {
735 		if (!can_modify_vma(next)) {
736 			error = -EPERM;
737 			goto modify_vma_failed;
738 		}
739 
740 		/* Does it split the end? */
741 		if (next->vm_end > end) {
742 			error = __split_vma(vmi, next, end, 0);
743 			if (error)
744 				goto end_split_failed;
745 		}
746 		vma_start_write(next);
747 		mas_set(&mas_detach, count);
748 		error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
749 		if (error)
750 			goto munmap_gather_failed;
751 		vma_mark_detached(next, true);
752 		if (next->vm_flags & VM_LOCKED)
753 			locked_vm += vma_pages(next);
754 
755 		count++;
756 		if (unlikely(uf)) {
757 			/*
758 			 * If userfaultfd_unmap_prep returns an error the vmas
759 			 * will remain split, but userland will get a
760 			 * highly unexpected error anyway. This is no
761 			 * different from the case where the first of the two
762 			 * __split_vma calls fails, but we don't undo the first
763 			 * split even though we could. This failure is unlikely
764 			 * enough that it's not worth optimizing for.
765 			 */
766 			error = userfaultfd_unmap_prep(next, start, end, uf);
767 
768 			if (error)
769 				goto userfaultfd_error;
770 		}
771 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
772 		BUG_ON(next->vm_start < start);
773 		BUG_ON(next->vm_start > end);
774 #endif
775 	} for_each_vma_range(*vmi, next, end);
776 
777 #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
778 	/* Make sure no VMAs are about to be lost. */
779 	{
780 		MA_STATE(test, &mt_detach, 0, 0);
781 		struct vm_area_struct *vma_mas, *vma_test;
782 		int test_count = 0;
783 
784 		vma_iter_set(vmi, start);
785 		rcu_read_lock();
786 		vma_test = mas_find(&test, count - 1);
787 		for_each_vma_range(*vmi, vma_mas, end) {
788 			BUG_ON(vma_mas != vma_test);
789 			test_count++;
790 			vma_test = mas_next(&test, count - 1);
791 		}
792 		rcu_read_unlock();
793 		BUG_ON(count != test_count);
794 	}
795 #endif
796 
797 	while (vma_iter_addr(vmi) > start)
798 		vma_iter_prev_range(vmi);
799 
800 	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
801 	if (error)
802 		goto clear_tree_failed;
803 
804 	/* Point of no return */
805 	mm->locked_vm -= locked_vm;
806 	mm->map_count -= count;
807 	if (unlock)
808 		mmap_write_downgrade(mm);
809 
810 	prev = vma_iter_prev_range(vmi);
811 	next = vma_next(vmi);
812 	if (next)
813 		vma_iter_prev_range(vmi);
814 
815 	/*
816 	 * We can free page tables without write-locking mmap_lock because VMAs
817 	 * were isolated before we downgraded mmap_lock.
818 	 */
819 	mas_set(&mas_detach, 1);
820 	unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
821 		     !unlock);
822 	/* Statistics and freeing VMAs */
823 	mas_set(&mas_detach, 0);
824 	remove_mt(mm, &mas_detach);
825 	validate_mm(mm);
826 	if (unlock)
827 		mmap_read_unlock(mm);
828 
829 	__mt_destroy(&mt_detach);
830 	return 0;
831 
832 modify_vma_failed:
833 clear_tree_failed:
834 userfaultfd_error:
835 munmap_gather_failed:
836 end_split_failed:
837 	mas_set(&mas_detach, 0);
838 	mas_for_each(&mas_detach, next, end)
839 		vma_mark_detached(next, false);
840 
841 	__mt_destroy(&mt_detach);
842 start_split_failed:
843 map_count_exceeded:
844 	validate_mm(mm);
845 	return error;
846 }
847 
848 /*
849  * do_vmi_munmap() - munmap a given range.
850  * @vmi: The vma iterator
851  * @mm: The mm_struct
852  * @start: The start address to munmap
853  * @len: The length of the range to munmap
854  * @uf: The userfaultfd list_head
855  * @unlock: set to true if the user wants to drop the mmap_lock on success
856  *
857  * This function takes @vmi, either pointing to the previous VMA or set
858  * to MA_START, and sets it up to remove the mapping(s).  The @len will be
859  * page aligned.
860  *
861  * Return: 0 on success and drops the lock if so directed, error and leaves the
862  * lock held otherwise.
863  */
864 int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
865 		  unsigned long start, size_t len, struct list_head *uf,
866 		  bool unlock)
867 {
868 	unsigned long end;
869 	struct vm_area_struct *vma;
870 
871 	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
872 		return -EINVAL;
873 
874 	end = start + PAGE_ALIGN(len);
875 	if (end == start)
876 		return -EINVAL;
877 
878 	/* Find the first overlapping VMA */
879 	vma = vma_find(vmi, end);
880 	if (!vma) {
881 		if (unlock)
882 			mmap_write_unlock(mm);
883 		return 0;
884 	}
885 
886 	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
887 }
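/*
 * Editor's illustrative sketch (not part of the original file): a typical
 * caller already holding mmap_write_lock(mm) unmapping [start, start + len),
 * keeping the lock held on return. The helper name is hypothetical.
 */
static int __maybe_unused example_munmap_locked(struct mm_struct *mm,
						unsigned long start, size_t len)
{
	VMA_ITERATOR(vmi, mm, start);

	return do_vmi_munmap(&vmi, mm, start, len, NULL, false);
}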
888 
889 /*
890  * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
891  * figure out whether that can be merged with its predecessor or its
892  * successor.  Or both (it neatly fills a hole).
893  *
894  * In most cases - when called for mmap, brk or mremap - [addr,end) is
895  * certain not to be mapped by the time vma_merge is called; but when
896  * called for mprotect, it is certain to be already mapped (either at
897  * an offset within prev, or at the start of next), and the flags of
898  * this area are about to be changed to vm_flags - and the no-change
899  * case has already been eliminated.
900  *
901  * The following mprotect cases have to be considered, where **** is
902  * the area passed down from mprotect_fixup, never extending beyond one
903  * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
904  * at the same address as **** and is of the same or larger span, and
905  * NNNN the next vma after ****:
906  *
907  *     ****             ****                   ****
908  *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPCCCCCC
909  *    cannot merge    might become       might become
910  *                    PPNNNNNNNNNN       PPPPPPPPPPCC
911  *    mmap, brk or    case 4 below       case 5 below
912  *    mremap move:
913  *                        ****               ****
914  *                    PPPP    NNNN       PPPPCCCCNNNN
915  *                    might become       might become
916  *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
917  *                    PPPPPPPPNNNN 2 or  PPPPPPPPNNNN 7 or
918  *                    PPPPNNNNNNNN 3     PPPPNNNNNNNN 8
919  *
920  * It is important for case 8 that the vma CCCC overlapping the
921  * region **** is never going to be extended over NNNN. Instead NNNN must
922  * be extended in region **** and CCCC must be removed. This way in
923  * all cases where vma_merge succeeds, the moment vma_merge drops the
924  * rmap_locks, the properties of the merged vma will be already
925  * correct for the whole merged range. Some of those properties like
926  * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
927  * be correct for the whole merged range immediately after the
928  * rmap_locks are released. Otherwise if NNNN would be removed and
929  * CCCC would be extended over the NNNN range, remove_migration_ptes
930  * or other rmap walkers (if working on addresses beyond the "end"
931  * parameter) may establish ptes with the wrong permissions of CCCC
932  * instead of the right permissions of NNNN.
933  *
934  * In the code below:
935  * PPPP is represented by *prev
936  * CCCC is represented by *curr or not represented at all (NULL)
937  * NNNN is represented by *next or not represented at all (NULL)
938  * **** is not represented - it will be merged and the vma containing the
939  *      area is returned, or the function will return NULL
940  */
941 static struct vm_area_struct
942 *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
943 	   struct vm_area_struct *src, unsigned long addr, unsigned long end,
944 	   unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
945 	   struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
946 	   struct anon_vma_name *anon_name)
947 {
948 	struct mm_struct *mm = src->vm_mm;
949 	struct anon_vma *anon_vma = src->anon_vma;
950 	struct file *file = src->vm_file;
951 	struct vm_area_struct *curr, *next, *res;
952 	struct vm_area_struct *vma, *adjust, *remove, *remove2;
953 	struct vm_area_struct *anon_dup = NULL;
954 	struct vma_prepare vp;
955 	pgoff_t vma_pgoff;
956 	int err = 0;
957 	bool merge_prev = false;
958 	bool merge_next = false;
959 	bool vma_expanded = false;
960 	unsigned long vma_start = addr;
961 	unsigned long vma_end = end;
962 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
963 	long adj_start = 0;
964 
965 	/*
966 	 * We later require that vma->vm_flags == vm_flags,
967 	 * so this tests vma->vm_flags & VM_SPECIAL, too.
968 	 */
969 	if (vm_flags & VM_SPECIAL)
970 		return NULL;
971 
972 	/* Does the input range span an existing VMA? (cases 5 - 8) */
973 	curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
974 
975 	if (!curr ||			/* cases 1 - 4 */
976 	    end == curr->vm_end)	/* cases 6 - 8, adjacent VMA */
977 		next = vma_lookup(mm, end);
978 	else
979 		next = NULL;		/* case 5 */
980 
981 	if (prev) {
982 		vma_start = prev->vm_start;
983 		vma_pgoff = prev->vm_pgoff;
984 
985 		/* Can we merge the predecessor? */
986 		if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
987 		    && can_vma_merge_after(prev, vm_flags, anon_vma, file,
988 					   pgoff, vm_userfaultfd_ctx, anon_name)) {
989 			merge_prev = true;
990 			vma_prev(vmi);
991 		}
992 	}
993 
994 	/* Can we merge the successor? */
995 	if (next && mpol_equal(policy, vma_policy(next)) &&
996 	    can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
997 				 vm_userfaultfd_ctx, anon_name)) {
998 		merge_next = true;
999 	}
1000 
1001 	/* Verify some invariants that must be enforced by the caller. */
1002 	VM_WARN_ON(prev && addr <= prev->vm_start);
1003 	VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
1004 	VM_WARN_ON(addr >= end);
1005 
1006 	if (!merge_prev && !merge_next)
1007 		return NULL; /* Not mergeable. */
1008 
1009 	if (merge_prev)
1010 		vma_start_write(prev);
1011 
1012 	res = vma = prev;
1013 	remove = remove2 = adjust = NULL;
1014 
1015 	/* Can we merge both the predecessor and the successor? */
1016 	if (merge_prev && merge_next &&
1017 	    is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
1018 		vma_start_write(next);
1019 		remove = next;				/* case 1 */
1020 		vma_end = next->vm_end;
1021 		err = dup_anon_vma(prev, next, &anon_dup);
1022 		if (curr) {				/* case 6 */
1023 			vma_start_write(curr);
1024 			remove = curr;
1025 			remove2 = next;
1026 			/*
1027 			 * Note that the dup_anon_vma below cannot overwrite err
1028 			 * since the first caller would do nothing unless next
1029 			 * has an anon_vma.
1030 			 */
1031 			if (!next->anon_vma)
1032 				err = dup_anon_vma(prev, curr, &anon_dup);
1033 		}
1034 	} else if (merge_prev) {			/* case 2 */
1035 		if (curr) {
1036 			vma_start_write(curr);
1037 			if (end == curr->vm_end) {	/* case 7 */
1038 				/*
1039 				 * can_vma_merge_after() assumed we would not be
1040 				 * removing prev vma, so it skipped the check
1041 				 * for vm_ops->close, but we are removing curr
1042 				 */
1043 				if (curr->vm_ops && curr->vm_ops->close)
1044 					err = -EINVAL;
1045 				remove = curr;
1046 			} else {			/* case 5 */
1047 				adjust = curr;
1048 				adj_start = (end - curr->vm_start);
1049 			}
1050 			if (!err)
1051 				err = dup_anon_vma(prev, curr, &anon_dup);
1052 		}
1053 	} else { /* merge_next */
1054 		vma_start_write(next);
1055 		res = next;
1056 		if (prev && addr < prev->vm_end) {	/* case 4 */
1057 			vma_start_write(prev);
1058 			vma_end = addr;
1059 			adjust = next;
1060 			adj_start = -(prev->vm_end - addr);
1061 			err = dup_anon_vma(next, prev, &anon_dup);
1062 		} else {
1063 			/*
1064 			 * Note that cases 3 and 8 are the ONLY ones where prev
1065 			 * is permitted to be (but is not necessarily) NULL.
1066 			 */
1067 			vma = next;			/* case 3 */
1068 			vma_start = addr;
1069 			vma_end = next->vm_end;
1070 			vma_pgoff = next->vm_pgoff - pglen;
1071 			if (curr) {			/* case 8 */
1072 				vma_pgoff = curr->vm_pgoff;
1073 				vma_start_write(curr);
1074 				remove = curr;
1075 				err = dup_anon_vma(next, curr, &anon_dup);
1076 			}
1077 		}
1078 	}
1079 
1080 	/* Error in anon_vma clone. */
1081 	if (err)
1082 		goto anon_vma_fail;
1083 
1084 	if (vma_start < vma->vm_start || vma_end > vma->vm_end)
1085 		vma_expanded = true;
1086 
1087 	if (vma_expanded) {
1088 		vma_iter_config(vmi, vma_start, vma_end);
1089 	} else {
1090 		vma_iter_config(vmi, adjust->vm_start + adj_start,
1091 				adjust->vm_end);
1092 	}
1093 
1094 	if (vma_iter_prealloc(vmi, vma))
1095 		goto prealloc_fail;
1096 
1097 	init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
1098 	VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
1099 		   vp.anon_vma != adjust->anon_vma);
1100 
1101 	vma_prepare(&vp);
1102 	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
1103 	vma_set_range(vma, vma_start, vma_end, vma_pgoff);
1104 
1105 	if (vma_expanded)
1106 		vma_iter_store(vmi, vma);
1107 
1108 	if (adj_start) {
1109 		adjust->vm_start += adj_start;
1110 		adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
1111 		if (adj_start < 0) {
1112 			WARN_ON(vma_expanded);
1113 			vma_iter_store(vmi, next);
1114 		}
1115 	}
1116 
1117 	vma_complete(&vp, vmi, mm);
1118 	khugepaged_enter_vma(res, vm_flags);
1119 	return res;
1120 
1121 prealloc_fail:
1122 	if (anon_dup)
1123 		unlink_anon_vmas(anon_dup);
1124 
1125 anon_vma_fail:
1126 	vma_iter_set(vmi, addr);
1127 	vma_iter_load(vmi);
1128 	return NULL;
1129 }
1130 
1131 /*
1132  * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
1133  * context and anonymous VMA name within the range [start, end).
1134  *
1135  * As a result, we might be able to merge the newly modified VMA range with an
1136  * adjacent VMA with identical properties.
1137  *
1138  * If no merge is possible and the range does not span the entirety of the VMA,
1139  * we then need to split the VMA to accommodate the change.
1140  *
1141  * The function returns either the merged VMA, the original VMA if a split was
1142  * required instead, or an error if the split failed.
1143  */
1144 struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
1145 				  struct vm_area_struct *prev,
1146 				  struct vm_area_struct *vma,
1147 				  unsigned long start, unsigned long end,
1148 				  unsigned long vm_flags,
1149 				  struct mempolicy *policy,
1150 				  struct vm_userfaultfd_ctx uffd_ctx,
1151 				  struct anon_vma_name *anon_name)
1152 {
1153 	pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
1154 	struct vm_area_struct *merged;
1155 
1156 	merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
1157 			   pgoff, policy, uffd_ctx, anon_name);
1158 	if (merged)
1159 		return merged;
1160 
1161 	if (vma->vm_start < start) {
1162 		int err = split_vma(vmi, vma, start, 1);
1163 
1164 		if (err)
1165 			return ERR_PTR(err);
1166 	}
1167 
1168 	if (vma->vm_end > end) {
1169 		int err = split_vma(vmi, vma, end, 0);
1170 
1171 		if (err)
1172 			return ERR_PTR(err);
1173 	}
1174 
1175 	return vma;
1176 }
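/*
 * Editor's illustrative sketch (not part of the original file): how an
 * mprotect_fixup()-style caller might apply @new_flags to [start, end),
 * letting vma_modify() merge with a neighbour or split @vma as needed.
 * The helper name is hypothetical; @prev is the VMA preceding @vma.
 */
static struct vm_area_struct *__maybe_unused
example_change_flags(struct vma_iterator *vmi, struct vm_area_struct *prev,
		     struct vm_area_struct *vma, unsigned long start,
		     unsigned long end, unsigned long new_flags)
{
	return vma_modify(vmi, prev, vma, start, end, new_flags,
			  vma_policy(vma), vma->vm_userfaultfd_ctx,
			  anon_vma_name(vma));
}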
1177 
1178 /*
1179  * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
1180  * must ensure that [start, end) does not overlap any existing VMA.
1181  */
1182 struct vm_area_struct
1183 *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
1184 		   struct vm_area_struct *vma, unsigned long start,
1185 		   unsigned long end, pgoff_t pgoff)
1186 {
1187 	return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
1188 			 vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1189 }
1190 
1191 /*
1192  * Expand vma by delta bytes, potentially merging with an immediately adjacent
1193  * VMA with identical properties.
1194  */
1195 struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
1196 					struct vm_area_struct *vma,
1197 					unsigned long delta)
1198 {
1199 	pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
1200 
1201 	/* vma is specified as prev, so case 1 or 2 will apply. */
1202 	return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
1203 			 vma->vm_flags, pgoff, vma_policy(vma),
1204 			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1205 }
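/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * growing @vma in place by @delta bytes would first try vma_merge_extend()
 * and fall back to a slower path when no merge is possible. The helper
 * name is hypothetical.
 */
static bool __maybe_unused example_try_extend_in_place(struct vma_iterator *vmi,
		struct vm_area_struct *vma, unsigned long delta)
{
	return vma_merge_extend(vmi, vma, delta) != NULL;
}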
1206 
1207 void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
1208 {
1209 	vb->count = 0;
1210 }
1211 
1212 static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
1213 {
1214 	struct address_space *mapping;
1215 	int i;
1216 
1217 	mapping = vb->vmas[0]->vm_file->f_mapping;
1218 	i_mmap_lock_write(mapping);
1219 	for (i = 0; i < vb->count; i++) {
1220 		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
1221 		__remove_shared_vm_struct(vb->vmas[i], mapping);
1222 	}
1223 	i_mmap_unlock_write(mapping);
1224 
1225 	unlink_file_vma_batch_init(vb);
1226 }
1227 
1228 void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
1229 			       struct vm_area_struct *vma)
1230 {
1231 	if (vma->vm_file == NULL)
1232 		return;
1233 
1234 	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
1235 	    vb->count == ARRAY_SIZE(vb->vmas))
1236 		unlink_file_vma_batch_process(vb);
1237 
1238 	vb->vmas[vb->count] = vma;
1239 	vb->count++;
1240 }
1241 
1242 void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
1243 {
1244 	if (vb->count > 0)
1245 		unlink_file_vma_batch_process(vb);
1246 }
1247 
1248 /*
1249  * Unlink a file-based vm structure from its interval tree, to hide
1250  * vma from rmap and vmtruncate before freeing its page tables.
1251  */
1252 void unlink_file_vma(struct vm_area_struct *vma)
1253 {
1254 	struct file *file = vma->vm_file;
1255 
1256 	if (file) {
1257 		struct address_space *mapping = file->f_mapping;
1258 
1259 		i_mmap_lock_write(mapping);
1260 		__remove_shared_vm_struct(vma, mapping);
1261 		i_mmap_unlock_write(mapping);
1262 	}
1263 }
1264 
1265 void vma_link_file(struct vm_area_struct *vma)
1266 {
1267 	struct file *file = vma->vm_file;
1268 	struct address_space *mapping;
1269 
1270 	if (file) {
1271 		mapping = file->f_mapping;
1272 		i_mmap_lock_write(mapping);
1273 		__vma_link_file(vma, mapping);
1274 		i_mmap_unlock_write(mapping);
1275 	}
1276 }
1277 
1278 int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
1279 {
1280 	VMA_ITERATOR(vmi, mm, 0);
1281 
1282 	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
1283 	if (vma_iter_prealloc(&vmi, vma))
1284 		return -ENOMEM;
1285 
1286 	vma_start_write(vma);
1287 	vma_iter_store(&vmi, vma);
1288 	vma_link_file(vma);
1289 	mm->map_count++;
1290 	validate_mm(mm);
1291 	return 0;
1292 }
1293 
1294 /*
1295  * Copy the vma structure to a new location in the same mm,
1296  * prior to moving page table entries, to effect an mremap move.
1297  */
1298 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1299 	unsigned long addr, unsigned long len, pgoff_t pgoff,
1300 	bool *need_rmap_locks)
1301 {
1302 	struct vm_area_struct *vma = *vmap;
1303 	unsigned long vma_start = vma->vm_start;
1304 	struct mm_struct *mm = vma->vm_mm;
1305 	struct vm_area_struct *new_vma, *prev;
1306 	bool faulted_in_anon_vma = true;
1307 	VMA_ITERATOR(vmi, mm, addr);
1308 
1309 	/*
1310 	 * If anonymous vma has not yet been faulted, update new pgoff
1311 	 * to match new location, to increase its chance of merging.
1312 	 */
1313 	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
1314 		pgoff = addr >> PAGE_SHIFT;
1315 		faulted_in_anon_vma = false;
1316 	}
1317 
1318 	new_vma = find_vma_prev(mm, addr, &prev);
1319 	if (new_vma && new_vma->vm_start < addr + len)
1320 		return NULL;	/* should never get here */
1321 
1322 	new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
1323 	if (new_vma) {
1324 		/*
1325 		 * Source vma may have been merged into new_vma
1326 		 */
1327 		if (unlikely(vma_start >= new_vma->vm_start &&
1328 			     vma_start < new_vma->vm_end)) {
1329 			/*
1330 			 * The only way we can get a vma_merge with
1331 			 * self during an mremap is if the vma hasn't
1332 			 * been faulted in yet and we were allowed to
1333 			 * reset the dst vma->vm_pgoff to the
1334 			 * destination address of the mremap to allow
1335 			 * the merge to happen. mremap must change the
1336 			 * vm_pgoff linearity between src and dst vmas
1337 			 * (in turn preventing a vma_merge) to be
1338 			 * safe. It is only safe to keep the vm_pgoff
1339 			 * linear if there are no pages mapped yet.
1340 			 */
1341 			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
1342 			*vmap = vma = new_vma;
1343 		}
1344 		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
1345 	} else {
1346 		new_vma = vm_area_dup(vma);
1347 		if (!new_vma)
1348 			goto out;
1349 		vma_set_range(new_vma, addr, addr + len, pgoff);
1350 		if (vma_dup_policy(vma, new_vma))
1351 			goto out_free_vma;
1352 		if (anon_vma_clone(new_vma, vma))
1353 			goto out_free_mempol;
1354 		if (new_vma->vm_file)
1355 			get_file(new_vma->vm_file);
1356 		if (new_vma->vm_ops && new_vma->vm_ops->open)
1357 			new_vma->vm_ops->open(new_vma);
1358 		if (vma_link(mm, new_vma))
1359 			goto out_vma_link;
1360 		*need_rmap_locks = false;
1361 	}
1362 	return new_vma;
1363 
1364 out_vma_link:
1365 	if (new_vma->vm_ops && new_vma->vm_ops->close)
1366 		new_vma->vm_ops->close(new_vma);
1367 
1368 	if (new_vma->vm_file)
1369 		fput(new_vma->vm_file);
1370 
1371 	unlink_anon_vmas(new_vma);
1372 out_free_mempol:
1373 	mpol_put(vma_policy(new_vma));
1374 out_free_vma:
1375 	vm_area_free(new_vma);
1376 out:
1377 	return NULL;
1378 }
1379 
1380 /*
1381  * Rough compatibility check to quickly see if it's even worth looking
1382  * at sharing an anon_vma.
1383  *
1384  * They need to have the same vm_file, and the flags can only differ
1385  * in things that mprotect may change.
1386  *
1387  * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1388  * we can merge the two vma's. For example, we refuse to merge a vma if
1389  * there is a vm_ops->close() function, because that indicates that the
1390  * driver is doing some kind of reference counting. But that doesn't
1391  * really matter for the anon_vma sharing case.
1392  */
1393 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1394 {
1395 	return a->vm_end == b->vm_start &&
1396 		mpol_equal(vma_policy(a), vma_policy(b)) &&
1397 		a->vm_file == b->vm_file &&
1398 		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
1399 		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1400 }
1401 
1402 /*
1403  * Do some basic sanity checking to see if we can re-use the anon_vma
1404  * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1405  * the same as 'old', the other will be the new one that is trying
1406  * to share the anon_vma.
1407  *
1408  * NOTE! This runs with mmap_lock held for reading, so it is possible that
1409  * the anon_vma of 'old' is concurrently in the process of being set up
1410  * by another page fault trying to merge _that_. But that's ok: if it
1411  * is being set up, that automatically means that it will be a singleton
1412  * acceptable for merging, so we can do all of this optimistically. But
1413  * we do that READ_ONCE() to make sure that we never re-load the pointer.
1414  *
1415  * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1416  * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1417  * is to return an anon_vma that is "complex" due to having gone through
1418  * a fork).
1419  *
1420  * We also make sure that the two vma's are compatible (adjacent,
1421  * and with the same memory policies). That's all stable, even with just
1422  * a read lock on the mmap_lock.
1423  */
1424 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
1425 					  struct vm_area_struct *a,
1426 					  struct vm_area_struct *b)
1427 {
1428 	if (anon_vma_compatible(a, b)) {
1429 		struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1430 
1431 		if (anon_vma && list_is_singular(&old->anon_vma_chain))
1432 			return anon_vma;
1433 	}
1434 	return NULL;
1435 }
1436 
1437 /*
1438  * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1439  * neighbouring vmas for a suitable anon_vma, before it goes off
1440  * to allocate a new anon_vma.  It checks because a repetitive
1441  * sequence of mprotects and faults may otherwise lead to distinct
1442  * anon_vmas being allocated, preventing vma merge in subsequent
1443  * mprotect.
1444  */
1445 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1446 {
1447 	struct anon_vma *anon_vma = NULL;
1448 	struct vm_area_struct *prev, *next;
1449 	VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
1450 
1451 	/* Try next first. */
1452 	next = vma_iter_load(&vmi);
1453 	if (next) {
1454 		anon_vma = reusable_anon_vma(next, vma, next);
1455 		if (anon_vma)
1456 			return anon_vma;
1457 	}
1458 
1459 	prev = vma_prev(&vmi);
1460 	VM_BUG_ON_VMA(prev != vma, vma);
1461 	prev = vma_prev(&vmi);
1462 	/* Try prev next. */
1463 	if (prev)
1464 		anon_vma = reusable_anon_vma(prev, prev, vma);
1465 
1466 	/*
1467 	 * We might reach here with anon_vma == NULL if we can't find
1468 	 * any reusable anon_vma.
1469 	 * There's no absolute need to look only at touching neighbours:
1470 	 * we could search further afield for "compatible" anon_vmas.
1471 	 * But it would probably just be a waste of time searching,
1472 	 * or lead to too many vmas hanging off the same anon_vma.
1473 	 * We're trying to allow mprotect remerging later on,
1474 	 * not trying to minimize memory used for anon_vmas.
1475 	 */
1476 	return anon_vma;
1477 }
1478 
1479 static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
1480 {
1481 	return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
1482 }
1483 
1484 static bool vma_is_shared_writable(struct vm_area_struct *vma)
1485 {
1486 	return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
1487 		(VM_WRITE | VM_SHARED);
1488 }
1489 
1490 static bool vma_fs_can_writeback(struct vm_area_struct *vma)
1491 {
1492 	/* No managed pages to writeback. */
1493 	if (vma->vm_flags & VM_PFNMAP)
1494 		return false;
1495 
1496 	return vma->vm_file && vma->vm_file->f_mapping &&
1497 		mapping_can_writeback(vma->vm_file->f_mapping);
1498 }
1499 
1500 /*
1501  * Does this VMA require the underlying folios to have their dirty state
1502  * tracked?
1503  */
1504 bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
1505 {
1506 	/* Only shared, writable VMAs require dirty tracking. */
1507 	if (!vma_is_shared_writable(vma))
1508 		return false;
1509 
1510 	/* Does the filesystem need to be notified? */
1511 	if (vm_ops_needs_writenotify(vma->vm_ops))
1512 		return true;
1513 
1514 	/*
1515 	 * Even if the filesystem doesn't indicate a need for writenotify, if it
1516 	 * can writeback, dirty tracking is still required.
1517 	 */
1518 	return vma_fs_can_writeback(vma);
1519 }
1520 
1521 /*
1522  * Some shared mappings will want the pages marked read-only
1523  * to track write events. If so, we'll downgrade vm_page_prot
1524  * to the private version (using protection_map[] without the
1525  * VM_SHARED bit).
1526  */
1527 bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1528 {
1529 	/* If it was private or non-writable, the write bit is already clear */
1530 	if (!vma_is_shared_writable(vma))
1531 		return false;
1532 
1533 	/* The backer wishes to know when pages are first written to? */
1534 	if (vm_ops_needs_writenotify(vma->vm_ops))
1535 		return true;
1536 
1537 	/* The open routine did something to the protections that pgprot_modify
1538 	 * won't preserve? */
1539 	if (pgprot_val(vm_page_prot) !=
1540 	    pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
1541 		return false;
1542 
1543 	/*
1544 	 * Do we need to track softdirty? hugetlb does not support softdirty
1545 	 * tracking yet.
1546 	 */
1547 	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
1548 		return true;
1549 
1550 	/* Do we need write faults for uffd-wp tracking? */
1551 	if (userfaultfd_wp(vma))
1552 		return true;
1553 
1554 	/* Can the mapping track the dirty pages? */
1555 	return vma_fs_can_writeback(vma);
1556 }
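/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * consumer of vma_wants_writenotify() recomputes vm_page_prot without the
 * shared-write bit so that the first write to a page faults, roughly as
 * vma_set_page_prot() does in mm/mmap.c. Hypothetical helper name.
 */
static pgprot_t __maybe_unused example_writenotify_prot(struct vm_area_struct *vma)
{
	vm_flags_t vm_flags = vma->vm_flags;
	pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);

	if (vma_wants_writenotify(vma, prot))
		prot = vm_pgprot_modify(prot, vm_flags & ~VM_SHARED);

	return prot;
}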
1557 
1558 unsigned long count_vma_pages_range(struct mm_struct *mm,
1559 				    unsigned long addr, unsigned long end)
1560 {
1561 	VMA_ITERATOR(vmi, mm, addr);
1562 	struct vm_area_struct *vma;
1563 	unsigned long nr_pages = 0;
1564 
1565 	for_each_vma_range(vmi, vma, end) {
1566 		unsigned long vm_start = max(addr, vma->vm_start);
1567 		unsigned long vm_end = min(end, vma->vm_end);
1568 
1569 		nr_pages += PHYS_PFN(vm_end - vm_start);
1570 	}
1571 
1572 	return nr_pages;
1573 }
1574 
1575 static DEFINE_MUTEX(mm_all_locks_mutex);
1576 
1577 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
1578 {
1579 	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
1580 		/*
1581 		 * The LSB of head.next can't change from under us
1582 		 * because we hold the mm_all_locks_mutex.
1583 		 */
1584 		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
1585 		/*
1586 		 * We can safely modify head.next after taking the
1587 		 * anon_vma->root->rwsem. If some other vma in this mm shares
1588 		 * the same anon_vma we won't take it again.
1589 		 *
1590 		 * No need of atomic instructions here, head.next
1591 		 * can't change from under us thanks to the
1592 		 * anon_vma->root->rwsem.
1593 		 */
1594 		if (__test_and_set_bit(0, (unsigned long *)
1595 				       &anon_vma->root->rb_root.rb_root.rb_node))
1596 			BUG();
1597 	}
1598 }
1599 
1600 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
1601 {
1602 	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
1603 		/*
1604 		 * AS_MM_ALL_LOCKS can't change from under us because
1605 		 * we hold the mm_all_locks_mutex.
1606 		 *
1607 		 * Operations on ->flags have to be atomic because
1608 		 * even if AS_MM_ALL_LOCKS is stable thanks to the
1609 		 * mm_all_locks_mutex, there may be other cpus
1610 		 * changing other bitflags in parallel to us.
1611 		 */
1612 		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
1613 			BUG();
1614 		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
1615 	}
1616 }
1617 
1618 /*
1619  * This operation locks against the VM for all pte/vma/mm related
1620  * operations that could ever happen on a certain mm. This includes
1621  * vmtruncate, try_to_unmap, and all page faults.
1622  *
1623  * The caller must take the mmap_lock in write mode before calling
1624  * mm_take_all_locks(). The caller isn't allowed to release the
1625  * mmap_lock until mm_drop_all_locks() returns.
1626  *
1627  * mmap_lock in write mode is required in order to block all operations
1628  * that could modify pagetables and free pages without need of
1629  * altering the vma layout. It's also needed in write mode to prevent new
1630  * anon_vmas from being associated with existing vmas.
1631  *
1632  * A single task can't take more than one mm_take_all_locks() in a row
1633  * or it would deadlock.
1634  *
1635  * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
1636  * mapping->flags avoid taking the same lock twice, if more than one
1637  * vma in this mm is backed by the same anon_vma or address_space.
1638  *
1639  * We take locks in the following order, according to the comment at the
1640  * beginning of mm/rmap.c:
1641  *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
1642  *     hugetlb mapping);
1643  *   - all vmas marked locked
1644  *   - all i_mmap_rwsem locks;
1645  *   - all anon_vma->rwsems
1646  *
1647  * We can take all locks within these types randomly because the VM code
1648  * doesn't nest them and we're protected from parallel mm_take_all_locks()
1649  * by mm_all_locks_mutex.
1650  *
1651  * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
1652  * that may have to take thousands of locks.
1653  *
1654  * mm_take_all_locks() can fail if it's interrupted by signals.
1655  */
1656 int mm_take_all_locks(struct mm_struct *mm)
1657 {
1658 	struct vm_area_struct *vma;
1659 	struct anon_vma_chain *avc;
1660 	VMA_ITERATOR(vmi, mm, 0);
1661 
1662 	mmap_assert_write_locked(mm);
1663 
1664 	mutex_lock(&mm_all_locks_mutex);
1665 
1666 	/*
1667 	 * vma_start_write() does not have a complement in mm_drop_all_locks()
1668 	 * because vma_start_write() is always asymmetrical; it marks a VMA as
1669 	 * being written to until mmap_write_unlock() or mmap_write_downgrade()
1670 	 * is reached.
1671 	 */
1672 	for_each_vma(vmi, vma) {
1673 		if (signal_pending(current))
1674 			goto out_unlock;
1675 		vma_start_write(vma);
1676 	}
1677 
1678 	vma_iter_init(&vmi, mm, 0);
1679 	for_each_vma(vmi, vma) {
1680 		if (signal_pending(current))
1681 			goto out_unlock;
1682 		if (vma->vm_file && vma->vm_file->f_mapping &&
1683 				is_vm_hugetlb_page(vma))
1684 			vm_lock_mapping(mm, vma->vm_file->f_mapping);
1685 	}
1686 
1687 	vma_iter_init(&vmi, mm, 0);
1688 	for_each_vma(vmi, vma) {
1689 		if (signal_pending(current))
1690 			goto out_unlock;
1691 		if (vma->vm_file && vma->vm_file->f_mapping &&
1692 				!is_vm_hugetlb_page(vma))
1693 			vm_lock_mapping(mm, vma->vm_file->f_mapping);
1694 	}
1695 
1696 	vma_iter_init(&vmi, mm, 0);
1697 	for_each_vma(vmi, vma) {
1698 		if (signal_pending(current))
1699 			goto out_unlock;
1700 		if (vma->anon_vma)
1701 			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
1702 				vm_lock_anon_vma(mm, avc->anon_vma);
1703 	}
1704 
1705 	return 0;
1706 
1707 out_unlock:
1708 	mm_drop_all_locks(mm);
1709 	return -EINTR;
1710 }
1711 
1712 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
1713 {
1714 	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
1715 		/*
1716 		 * The LSB of head.next can't change to 0 from under
1717 		 * us because we hold the mm_all_locks_mutex.
1718 		 *
1719 		 * We must however clear the bitflag before unlocking
1720 		 * the vma so the users using the anon_vma->rb_root will
1721 		 * never see our bitflag.
1722 		 *
1723 		 * No need of atomic instructions here, head.next
1724 		 * can't change from under us until we release the
1725 		 * anon_vma->root->rwsem.
1726 		 */
1727 		if (!__test_and_clear_bit(0, (unsigned long *)
1728 					  &anon_vma->root->rb_root.rb_root.rb_node))
1729 			BUG();
1730 		anon_vma_unlock_write(anon_vma);
1731 	}
1732 }
1733 
1734 static void vm_unlock_mapping(struct address_space *mapping)
1735 {
1736 	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
1737 		/*
1738 		 * AS_MM_ALL_LOCKS can't change to 0 from under us
1739 		 * because we hold the mm_all_locks_mutex.
1740 		 */
1741 		i_mmap_unlock_write(mapping);
1742 		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
1743 					&mapping->flags))
1744 			BUG();
1745 	}
1746 }
1747 
1748 /*
1749  * The mmap_lock cannot be released by the caller until
1750  * mm_drop_all_locks() returns.
1751  */
1752 void mm_drop_all_locks(struct mm_struct *mm)
1753 {
1754 	struct vm_area_struct *vma;
1755 	struct anon_vma_chain *avc;
1756 	VMA_ITERATOR(vmi, mm, 0);
1757 
1758 	mmap_assert_write_locked(mm);
1759 	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
1760 
1761 	for_each_vma(vmi, vma) {
1762 		if (vma->anon_vma)
1763 			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
1764 				vm_unlock_anon_vma(avc->anon_vma);
1765 		if (vma->vm_file && vma->vm_file->f_mapping)
1766 			vm_unlock_mapping(vma->vm_file->f_mapping);
1767 	}
1768 
1769 	mutex_unlock(&mm_all_locks_mutex);
1770 }
1771 }
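/*
 * Editor's illustrative sketch (not part of the original file): the
 * canonical pairing of mm_take_all_locks() and mm_drop_all_locks(), with
 * the mmap_lock held in write mode around both, as described in the
 * comment above mm_take_all_locks(). Hypothetical helper name; the work
 * done while all locks are held is elided.
 */
static int __maybe_unused example_with_all_locks(struct mm_struct *mm)
{
	int ret;

	mmap_write_lock(mm);
	ret = mm_take_all_locks(mm);	/* may fail with -EINTR on a signal */
	if (ret)
		goto out;

	/* ... operate on the entire address space here ... */

	mm_drop_all_locks(mm);
out:
	mmap_write_unlock(mm);
	return ret;
}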