1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 /*
4 * VMA-specific functions.
5 */
6
7 #include "vma_internal.h"
8 #include "vma.h"
9
10 struct mmap_state {
11 struct mm_struct *mm;
12 struct vma_iterator *vmi;
13
14 unsigned long addr;
15 unsigned long end;
16 pgoff_t pgoff;
17 unsigned long pglen;
18 vm_flags_t vm_flags;
19 struct file *file;
20 pgprot_t page_prot;
21
22 /* User-defined fields, perhaps updated by .mmap_prepare(). */
23 const struct vm_operations_struct *vm_ops;
24 void *vm_private_data;
25
26 unsigned long charged;
27
28 struct vm_area_struct *prev;
29 struct vm_area_struct *next;
30
31 /* Unmapping state. */
32 struct vma_munmap_struct vms;
33 struct ma_state mas_detach;
34 struct maple_tree mt_detach;
35
36 /* Determine if we can check KSM flags early in mmap() logic. */
37 bool check_ksm_early;
38 };
39
40 #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
41 struct mmap_state name = { \
42 .mm = mm_, \
43 .vmi = vmi_, \
44 .addr = addr_, \
45 .end = (addr_) + (len_), \
46 .pgoff = pgoff_, \
47 .pglen = PHYS_PFN(len_), \
48 .vm_flags = vm_flags_, \
49 .file = file_, \
50 .page_prot = vm_get_page_prot(vm_flags_), \
51 }
52
53 #define VMG_MMAP_STATE(name, map_, vma_) \
54 struct vma_merge_struct name = { \
55 .mm = (map_)->mm, \
56 .vmi = (map_)->vmi, \
57 .start = (map_)->addr, \
58 .end = (map_)->end, \
59 .vm_flags = (map_)->vm_flags, \
60 .pgoff = (map_)->pgoff, \
61 .file = (map_)->file, \
62 .prev = (map_)->prev, \
63 .middle = vma_, \
64 .next = (vma_) ? NULL : (map_)->next, \
65 .state = VMA_MERGE_START, \
66 }
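
/*
 * Illustrative sketch only (not an actual call site): a hypothetical caller in
 * the mmap() path would typically declare the mapping state and then derive a
 * merge state from it, e.g.:
 *
 *	MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
 *	VMG_MMAP_STATE(vmg, &map, NULL);
 *
 * The names "map" and "vmg", and the surrounding local variables, are assumed
 * here purely for illustration.
 */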
67
/*
 * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
 * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
 * would mean a wider range of folios sharing the root anon_vma lock, and thus
 * potential lock contention, so we do not wish to encourage merging such that
 * this scales into a problem.
 */
static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
{
	/*
	 * The list_is_singular() test is to avoid merging VMAs cloned from
	 * parents, which would otherwise hurt scalability by widening the
	 * scope of the anon_vma lock.
	 */
	return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
}
83
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
85 {
86 struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
87
88 if (!mpol_equal(vmg->policy, vma_policy(vma)))
89 return false;
	/*
	 * VM_SOFTDIRTY should not prevent VMA merging if we match the
	 * flags except for the dirty bit -- the caller should mark the
	 * merged VMA as dirty. If the dirty bit were not excluded from
	 * the comparison, we would increase pressure on the memory system,
	 * forcing the kernel to generate new VMAs when old ones could be
	 * extended instead.
	 */
98 if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
99 return false;
100 if (vma->vm_file != vmg->file)
101 return false;
102 if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
103 return false;
104 if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
105 return false;
106 return true;
107 }
108
static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
110 {
111 struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
	struct vm_area_struct *src = vmg->middle; /* existing merge case. */
	struct anon_vma *tgt_anon = tgt->anon_vma;
	struct anon_vma *src_anon = vmg->anon_vma;

	/*
	 * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
	 * will remove the existing VMA's anon_vmas, so there are no scalability
	 * concerns.
	 */
121 VM_WARN_ON(src && src_anon != src->anon_vma);
122
123 /* Case 1 - we will dup_anon_vma() from src into tgt. */
124 if (!tgt_anon && src_anon)
125 return !vma_had_uncowed_parents(src);
126 /* Case 2 - we will simply use tgt's anon_vma. */
127 if (tgt_anon && !src_anon)
128 return !vma_had_uncowed_parents(tgt);
129 /* Case 3 - the anon_vma's are already shared. */
130 return src_anon == tgt_anon;
131 }
132
133 /*
134 * init_multi_vma_prep() - Initializer for struct vma_prepare
135 * @vp: The vma_prepare struct
136 * @vma: The vma that will be altered once locked
137 * @vmg: The merge state that will be used to determine adjustment and VMA
138 * removal.
139 */
static void init_multi_vma_prep(struct vma_prepare *vp,
				struct vm_area_struct *vma,
				struct vma_merge_struct *vmg)
143 {
144 struct vm_area_struct *adjust;
145 struct vm_area_struct **remove = &vp->remove;
146
147 memset(vp, 0, sizeof(struct vma_prepare));
148 vp->vma = vma;
149 vp->anon_vma = vma->anon_vma;
150
151 if (vmg && vmg->__remove_middle) {
152 *remove = vmg->middle;
153 remove = &vp->remove2;
154 }
155 if (vmg && vmg->__remove_next)
156 *remove = vmg->next;
157
158 if (vmg && vmg->__adjust_middle_start)
159 adjust = vmg->middle;
160 else if (vmg && vmg->__adjust_next_start)
161 adjust = vmg->next;
162 else
163 adjust = NULL;
164
165 vp->adj_next = adjust;
166 if (!vp->anon_vma && adjust)
167 vp->anon_vma = adjust->anon_vma;
168
169 VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
170 vp->anon_vma != adjust->anon_vma);
171
172 vp->file = vma->vm_file;
173 if (vp->file)
174 vp->mapping = vma->vm_file->f_mapping;
175
176 if (vmg && vmg->skip_vma_uprobe)
177 vp->skip_vma_uprobe = true;
178 }
179
180 /*
181 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
182 * in front of (at a lower virtual address and file offset than) the vma.
183 *
184 * We cannot merge two vmas if they have differently assigned (non-NULL)
185 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
186 *
187 * We don't check here for the merged mmap wrapping around the end of pagecache
188 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
189 * wrap, nor mmaps which cover the final page at index -1UL.
190 *
191 * We assume the vma may be removed as part of the merge.
192 */
static bool can_vma_merge_before(struct vma_merge_struct *vmg)
194 {
195 pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
196
197 if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
198 is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
199 if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
200 return true;
201 }
202
203 return false;
204 }
205
206 /*
207 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
208 * beyond (at a higher virtual address and file offset than) the vma.
209 *
210 * We cannot merge two vmas if they have differently assigned (non-NULL)
211 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
212 *
213 * We assume that vma is not removed as part of the merge.
214 */
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
216 {
217 if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
218 is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
219 if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
220 return true;
221 }
222 return false;
223 }
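
/*
 * Worked example (assumed values, 4KiB pages, for illustration only): if prev
 * spans [0x1000, 0x3000) with vm_pgoff == 10, it covers two pages, so a
 * proposed region starting at 0x3000 satisfies can_vma_merge_after() only if
 * its pgoff is 10 + 2 == 12 - i.e. the file offsets continue contiguously
 * across the virtual address boundary.
 */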
224
static void __vma_link_file(struct vm_area_struct *vma,
			    struct address_space *mapping)
227 {
228 if (vma_is_shared_maywrite(vma))
229 mapping_allow_writable(mapping);
230
231 flush_dcache_mmap_lock(mapping);
232 vma_interval_tree_insert(vma, &mapping->i_mmap);
233 flush_dcache_mmap_unlock(mapping);
234 }
235
236 /*
237 * Requires inode->i_mapping->i_mmap_rwsem
238 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
				      struct address_space *mapping)
241 {
242 if (vma_is_shared_maywrite(vma))
243 mapping_unmap_writable(mapping);
244
245 flush_dcache_mmap_lock(mapping);
246 vma_interval_tree_remove(vma, &mapping->i_mmap);
247 flush_dcache_mmap_unlock(mapping);
248 }
249
250 /*
251 * vma has some anon_vma assigned, and is already inserted on that
252 * anon_vma's interval trees.
253 *
254 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
255 * vma must be removed from the anon_vma's interval trees using
256 * anon_vma_interval_tree_pre_update_vma().
257 *
258 * After the update, the vma will be reinserted using
259 * anon_vma_interval_tree_post_update_vma().
260 *
261 * The entire update must be protected by exclusive mmap_lock and by
262 * the root anon_vma's mutex.
263 */
static void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
266 {
267 struct anon_vma_chain *avc;
268
269 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
270 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
271 }
272
static void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
275 {
276 struct anon_vma_chain *avc;
277
278 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
279 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
280 }
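
/*
 * Illustrative usage pattern only, mirroring what vma_prepare()/vma_complete()
 * below do: any code adjusting vm_start/vm_end/vm_pgoff of an anon-mapped VMA
 * is expected to bracket the update as follows:
 *
 *	anon_vma_lock_write(vma->anon_vma);
 *	anon_vma_interval_tree_pre_update_vma(vma);
 *	... update vma->vm_start / vm_end / vm_pgoff ...
 *	anon_vma_interval_tree_post_update_vma(vma);
 *	anon_vma_unlock_write(vma->anon_vma);
 */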
281
282 /*
283 * vma_prepare() - Helper function for handling locking VMAs prior to altering
284 * @vp: The initialized vma_prepare struct
285 */
static void vma_prepare(struct vma_prepare *vp)
287 {
288 if (vp->file) {
289 uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
290
291 if (vp->adj_next)
292 uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
293 vp->adj_next->vm_end);
294
295 i_mmap_lock_write(vp->mapping);
296 if (vp->insert && vp->insert->vm_file) {
297 /*
298 * Put into interval tree now, so instantiated pages
299 * are visible to arm/parisc __flush_dcache_page
300 * throughout; but we cannot insert into address
301 * space until vma start or end is updated.
302 */
303 __vma_link_file(vp->insert,
304 vp->insert->vm_file->f_mapping);
305 }
306 }
307
308 if (vp->anon_vma) {
309 anon_vma_lock_write(vp->anon_vma);
310 anon_vma_interval_tree_pre_update_vma(vp->vma);
311 if (vp->adj_next)
312 anon_vma_interval_tree_pre_update_vma(vp->adj_next);
313 }
314
315 if (vp->file) {
316 flush_dcache_mmap_lock(vp->mapping);
317 vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
318 if (vp->adj_next)
319 vma_interval_tree_remove(vp->adj_next,
320 &vp->mapping->i_mmap);
321 }
322
323 }
324
325 /*
 * vma_complete() - Helper function for handling the unlocking after altering VMAs,
327 * or for inserting a VMA.
328 *
329 * @vp: The vma_prepare struct
330 * @vmi: The vma iterator
331 * @mm: The mm_struct
332 */
static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
			 struct mm_struct *mm)
335 {
336 if (vp->file) {
337 if (vp->adj_next)
338 vma_interval_tree_insert(vp->adj_next,
339 &vp->mapping->i_mmap);
340 vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
341 flush_dcache_mmap_unlock(vp->mapping);
342 }
343
344 if (vp->remove && vp->file) {
345 __remove_shared_vm_struct(vp->remove, vp->mapping);
346 if (vp->remove2)
347 __remove_shared_vm_struct(vp->remove2, vp->mapping);
348 } else if (vp->insert) {
349 /*
350 * split_vma has split insert from vma, and needs
351 * us to insert it before dropping the locks
352 * (it may either follow vma or precede it).
353 */
354 vma_iter_store_new(vmi, vp->insert);
355 mm->map_count++;
356 }
357
358 if (vp->anon_vma) {
359 anon_vma_interval_tree_post_update_vma(vp->vma);
360 if (vp->adj_next)
361 anon_vma_interval_tree_post_update_vma(vp->adj_next);
362 anon_vma_unlock_write(vp->anon_vma);
363 }
364
365 if (vp->file) {
366 i_mmap_unlock_write(vp->mapping);
367
368 if (!vp->skip_vma_uprobe) {
369 uprobe_mmap(vp->vma);
370
371 if (vp->adj_next)
372 uprobe_mmap(vp->adj_next);
373 }
374 }
375
376 if (vp->remove) {
377 again:
378 vma_mark_detached(vp->remove);
379 if (vp->file) {
380 uprobe_munmap(vp->remove, vp->remove->vm_start,
381 vp->remove->vm_end);
382 fput(vp->file);
383 }
384 if (vp->remove->anon_vma)
385 anon_vma_merge(vp->vma, vp->remove);
386 mm->map_count--;
387 mpol_put(vma_policy(vp->remove));
388 if (!vp->remove2)
389 WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
390 vm_area_free(vp->remove);
391
392 /*
393 * In mprotect's case 6 (see comments on vma_merge),
394 * we are removing both mid and next vmas
395 */
396 if (vp->remove2) {
397 vp->remove = vp->remove2;
398 vp->remove2 = NULL;
399 goto again;
400 }
401 }
402 if (vp->insert && vp->file)
403 uprobe_mmap(vp->insert);
404 }
405
406 /*
407 * init_vma_prep() - Initializer wrapper for vma_prepare struct
408 * @vp: The vma_prepare struct
409 * @vma: The vma that will be altered once locked
410 */
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
412 {
413 init_multi_vma_prep(vp, vma, NULL);
414 }
415
416 /*
417 * Can the proposed VMA be merged with the left (previous) VMA taking into
418 * account the start position of the proposed range.
419 */
static bool can_vma_merge_left(struct vma_merge_struct *vmg)
{
423 return vmg->prev && vmg->prev->vm_end == vmg->start &&
424 can_vma_merge_after(vmg);
425 }
426
427 /*
428 * Can the proposed VMA be merged with the right (next) VMA taking into
429 * account the end position of the proposed range.
430 *
431 * In addition, if we can merge with the left VMA, ensure that left and right
432 * anon_vma's are also compatible.
433 */
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
				bool can_merge_left)
436 {
437 struct vm_area_struct *next = vmg->next;
438 struct vm_area_struct *prev;
439
440 if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
441 return false;
442
443 if (!can_merge_left)
444 return true;
445
446 /*
447 * If we can merge with prev (left) and next (right), indicating that
448 * each VMA's anon_vma is compatible with the proposed anon_vma, this
449 * does not mean prev and next are compatible with EACH OTHER.
450 *
451 * We therefore check this in addition to mergeability to either side.
452 */
453 prev = vmg->prev;
454 return !prev->anon_vma || !next->anon_vma ||
455 prev->anon_vma == next->anon_vma;
456 }
457
458 /*
459 * Close a vm structure and free it.
460 */
void remove_vma(struct vm_area_struct *vma)
462 {
463 might_sleep();
464 vma_close(vma);
465 if (vma->vm_file)
466 fput(vma->vm_file);
467 mpol_put(vma_policy(vma));
468 vm_area_free(vma);
469 }
470
471 /*
472 * Get rid of page table information in the indicated region.
473 *
474 * Called with the mm semaphore held.
475 */
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
		  struct vm_area_struct *prev, struct vm_area_struct *next)
478 {
479 struct mm_struct *mm = vma->vm_mm;
480 struct mmu_gather tlb;
481
482 tlb_gather_mmu(&tlb, mm);
483 update_hiwater_rss(mm);
484 unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
485 /* mm_wr_locked = */ true);
486 mas_set(mas, vma->vm_end);
487 free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
488 next ? next->vm_start : USER_PGTABLES_CEILING,
489 /* mm_wr_locked = */ true);
490 tlb_finish_mmu(&tlb);
491 }
492
493 /*
494 * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
495 * has already been checked or doesn't make sense to fail.
496 * VMA Iterator will point to the original VMA.
497 */
static __must_check int
__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
	    unsigned long addr, int new_below)
501 {
502 struct vma_prepare vp;
503 struct vm_area_struct *new;
504 int err;
505
506 WARN_ON(vma->vm_start >= addr);
507 WARN_ON(vma->vm_end <= addr);
508
509 if (vma->vm_ops && vma->vm_ops->may_split) {
510 err = vma->vm_ops->may_split(vma, addr);
511 if (err)
512 return err;
513 }
514
515 new = vm_area_dup(vma);
516 if (!new)
517 return -ENOMEM;
518
519 if (new_below) {
520 new->vm_end = addr;
521 } else {
522 new->vm_start = addr;
523 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
524 }
525
526 err = -ENOMEM;
527 vma_iter_config(vmi, new->vm_start, new->vm_end);
528 if (vma_iter_prealloc(vmi, new))
529 goto out_free_vma;
530
531 err = vma_dup_policy(vma, new);
532 if (err)
533 goto out_free_vmi;
534
535 err = anon_vma_clone(new, vma);
536 if (err)
537 goto out_free_mpol;
538
539 if (new->vm_file)
540 get_file(new->vm_file);
541
542 if (new->vm_ops && new->vm_ops->open)
543 new->vm_ops->open(new);
544
545 vma_start_write(vma);
546 vma_start_write(new);
547
548 init_vma_prep(&vp, vma);
549 vp.insert = new;
550 vma_prepare(&vp);
551
552 /*
553 * Get rid of huge pages and shared page tables straddling the split
554 * boundary.
555 */
556 vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
557 if (is_vm_hugetlb_page(vma))
558 hugetlb_split(vma, addr);
559
560 if (new_below) {
561 vma->vm_start = addr;
562 vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
563 } else {
564 vma->vm_end = addr;
565 }
566
567 /* vma_complete stores the new vma */
568 vma_complete(&vp, vmi, vma->vm_mm);
569 validate_mm(vma->vm_mm);
570
571 /* Success. */
572 if (new_below)
573 vma_next(vmi);
574 else
575 vma_prev(vmi);
576
577 return 0;
578
579 out_free_mpol:
580 mpol_put(vma_policy(new));
581 out_free_vmi:
582 vma_iter_free(vmi);
583 out_free_vma:
584 vm_area_free(new);
585 return err;
586 }
587
588 /*
589 * Split a vma into two pieces at address 'addr', a new vma is allocated
590 * either for the first part or the tail.
591 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
		     unsigned long addr, int new_below)
594 {
595 if (vma->vm_mm->map_count >= sysctl_max_map_count)
596 return -ENOMEM;
597
598 return __split_vma(vmi, vma, addr, new_below);
599 }
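
/*
 * Illustrative sketch only (hypothetical addresses): splitting a VMA spanning
 * [0x1000, 0x5000) at 0x3000 with new_below == 1, e.g.
 *
 *	err = split_vma(&vmi, vma, 0x3000, 1);
 *
 * allocates a new VMA for [0x1000, 0x3000) and trims the original to
 * [0x3000, 0x5000), leaving the iterator pointing at the original VMA.
 */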
600
601 /*
602 * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
603 * instance that the destination VMA has no anon_vma but the source does.
604 *
605 * @dst: The destination VMA
606 * @src: The source VMA
607 * @dup: Pointer to the destination VMA when successful.
608 *
609 * Returns: 0 on success.
610 */
static int dup_anon_vma(struct vm_area_struct *dst,
			struct vm_area_struct *src, struct vm_area_struct **dup)
613 {
614 /*
615 * There are three cases to consider for correctly propagating
616 * anon_vma's on merge.
617 *
618 * The first is trivial - neither VMA has anon_vma, we need not do
619 * anything.
620 *
621 * The second where both have anon_vma is also a no-op, as they must
622 * then be the same, so there is simply nothing to copy.
623 *
	 * Here we cover the third - if the destination VMA has no anon_vma,
	 * that is, it is unfaulted, we need to ensure that the newly merged
	 * range is referenced by the anon_vmas of the source.
627 */
628 if (src->anon_vma && !dst->anon_vma) {
629 int ret;
630
631 vma_assert_write_locked(dst);
632 dst->anon_vma = src->anon_vma;
633 ret = anon_vma_clone(dst, src);
634 if (ret)
635 return ret;
636
637 *dup = dst;
638 }
639
640 return 0;
641 }
642
643 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm)
645 {
646 int bug = 0;
647 int i = 0;
648 struct vm_area_struct *vma;
649 VMA_ITERATOR(vmi, mm, 0);
650
651 mt_validate(&mm->mm_mt);
652 for_each_vma(vmi, vma) {
653 #ifdef CONFIG_DEBUG_VM_RB
654 struct anon_vma *anon_vma = vma->anon_vma;
655 struct anon_vma_chain *avc;
656 #endif
657 unsigned long vmi_start, vmi_end;
658 bool warn = 0;
659
660 vmi_start = vma_iter_addr(&vmi);
661 vmi_end = vma_iter_end(&vmi);
662 if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
663 warn = 1;
664
665 if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
666 warn = 1;
667
668 if (warn) {
669 pr_emerg("issue in %s\n", current->comm);
670 dump_stack();
671 dump_vma(vma);
672 pr_emerg("tree range: %px start %lx end %lx\n", vma,
673 vmi_start, vmi_end - 1);
674 vma_iter_dump_tree(&vmi);
675 }
676
677 #ifdef CONFIG_DEBUG_VM_RB
678 if (anon_vma) {
679 anon_vma_lock_read(anon_vma);
680 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
681 anon_vma_interval_tree_verify(avc);
682 anon_vma_unlock_read(anon_vma);
683 }
684 #endif
		/* Check for an infinite loop */
686 if (++i > mm->map_count + 10) {
687 i = -1;
688 break;
689 }
690 }
691 if (i != mm->map_count) {
692 pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
693 bug = 1;
694 }
695 VM_BUG_ON_MM(bug, mm);
696 }
697 #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
698
699 /*
700 * Based on the vmg flag indicating whether we need to adjust the vm_start field
701 * for the middle or next VMA, we calculate what the range of the newly adjusted
702 * VMA ought to be, and set the VMA's range accordingly.
703 */
static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
705 {
706 struct vm_area_struct *adjust;
707 pgoff_t pgoff;
708
709 if (vmg->__adjust_middle_start) {
710 adjust = vmg->middle;
711 pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
712 } else if (vmg->__adjust_next_start) {
713 adjust = vmg->next;
714 pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
715 } else {
716 return;
717 }
718
719 vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
720 }
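
/*
 * Worked example (assumed values, 4KiB pages): if middle spans [0x2000, 0x6000)
 * with vm_pgoff == 4 and the merge sets vmg->end to 0x4000, the
 * __adjust_middle_start case above recomputes middle's pgoff as
 * 4 + PHYS_PFN(0x4000 - 0x2000) == 6 and shrinks middle to [0x4000, 0x6000),
 * keeping its file offsets consistent with the pages it still maps.
 */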
721
722 /*
723 * Actually perform the VMA merge operation.
724 *
 * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
 * modify any VMAs nor cause inconsistent state should an OOM condition arise.
727 *
728 * Returns 0 on success, or an error value on failure.
729 */
static int commit_merge(struct vma_merge_struct *vmg)
731 {
732 struct vm_area_struct *vma;
733 struct vma_prepare vp;
734
735 if (vmg->__adjust_next_start) {
736 /* We manipulate middle and adjust next, which is the target. */
737 vma = vmg->middle;
738 vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
739 } else {
740 vma = vmg->target;
741 /* Note: vma iterator must be pointing to 'start'. */
742 vma_iter_config(vmg->vmi, vmg->start, vmg->end);
743 }
744
745 init_multi_vma_prep(&vp, vma, vmg);
746
747 /*
748 * If vmg->give_up_on_oom is set, we're safe, because we don't actually
749 * manipulate any VMAs until we succeed at preallocation.
750 *
751 * Past this point, we will not return an error.
752 */
753 if (vma_iter_prealloc(vmg->vmi, vma))
754 return -ENOMEM;
755
756 vma_prepare(&vp);
757 /*
758 * THP pages may need to do additional splits if we increase
759 * middle->vm_start.
760 */
761 vma_adjust_trans_huge(vma, vmg->start, vmg->end,
762 vmg->__adjust_middle_start ? vmg->middle : NULL);
763 vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
764 vmg_adjust_set_range(vmg);
765 vma_iter_store_overwrite(vmg->vmi, vmg->target);
766
767 vma_complete(&vp, vmg->vmi, vma->vm_mm);
768
769 return 0;
770 }
771
772 /* We can only remove VMAs when merging if they do not have a close hook. */
static bool can_merge_remove_vma(struct vm_area_struct *vma)
774 {
775 return !vma->vm_ops || !vma->vm_ops->close;
776 }
777
778 /*
779 * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
780 * attributes modified.
781 *
782 * @vmg: Describes the modifications being made to a VMA and associated
783 * metadata.
784 *
785 * When the attributes of a range within a VMA change, then it might be possible
786 * for immediately adjacent VMAs to be merged into that VMA due to having
787 * identical properties.
788 *
789 * This function checks for the existence of any such mergeable VMAs and updates
790 * the maple tree describing the @vmg->middle->vm_mm address space to account
791 * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
792 * merge.
793 *
794 * As part of this operation, if a merge occurs, the @vmg object will have its
795 * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
796 * calls to this function should reset these fields.
797 *
798 * Returns: The merged VMA if merge succeeds, or NULL otherwise.
799 *
800 * ASSUMPTIONS:
 * - The caller must assign the VMA to be modified to @vmg->middle.
802 * - The caller must have set @vmg->prev to the previous VMA, if there is one.
803 * - The caller must not set @vmg->next, as we determine this.
804 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
805 * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
806 */
static __must_check struct vm_area_struct *vma_merge_existing_range(
		struct vma_merge_struct *vmg)
809 {
810 struct vm_area_struct *middle = vmg->middle;
811 struct vm_area_struct *prev = vmg->prev;
812 struct vm_area_struct *next;
813 struct vm_area_struct *anon_dup = NULL;
814 unsigned long start = vmg->start;
815 unsigned long end = vmg->end;
816 bool left_side = middle && start == middle->vm_start;
817 bool right_side = middle && end == middle->vm_end;
818 int err = 0;
819 bool merge_left, merge_right, merge_both;
820
821 mmap_assert_write_locked(vmg->mm);
822 VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
823 VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
824 VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
825 VM_WARN_ON_VMG(start >= end, vmg);
826
827 /*
828 * If middle == prev, then we are offset into a VMA. Otherwise, if we are
829 * not, we must span a portion of the VMA.
830 */
831 VM_WARN_ON_VMG(middle &&
832 ((middle != prev && vmg->start != middle->vm_start) ||
833 vmg->end > middle->vm_end), vmg);
834 /* The vmi must be positioned within vmg->middle. */
835 VM_WARN_ON_VMG(middle &&
836 !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
837 vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
838
839 vmg->state = VMA_MERGE_NOMERGE;
840
841 /*
842 * If a special mapping or if the range being modified is neither at the
843 * furthermost left or right side of the VMA, then we have no chance of
844 * merging and should abort.
845 */
846 if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
847 return NULL;
848
849 if (left_side)
850 merge_left = can_vma_merge_left(vmg);
851 else
852 merge_left = false;
853
854 if (right_side) {
855 next = vmg->next = vma_iter_next_range(vmg->vmi);
856 vma_iter_prev_range(vmg->vmi);
857
858 merge_right = can_vma_merge_right(vmg, merge_left);
859 } else {
860 merge_right = false;
861 next = NULL;
862 }
863
864 if (merge_left) /* If merging prev, position iterator there. */
865 vma_prev(vmg->vmi);
866 else if (!merge_right) /* If we have nothing to merge, abort. */
867 return NULL;
868
869 merge_both = merge_left && merge_right;
870 /* If we span the entire VMA, a merge implies it will be deleted. */
871 vmg->__remove_middle = left_side && right_side;
872
873 /*
874 * If we need to remove middle in its entirety but are unable to do so,
875 * we have no sensible recourse but to abort the merge.
876 */
877 if (vmg->__remove_middle && !can_merge_remove_vma(middle))
878 return NULL;
879
880 /*
	 * If we merge both VMAs, then next is also deleted. This implies
	 * that middle is removed (i.e. __remove_middle is set) as well.
	 */
884 vmg->__remove_next = merge_both;
885
886 /*
887 * If we cannot delete next, then we can reduce the operation to merging
888 * prev and middle (thereby deleting middle).
889 */
890 if (vmg->__remove_next && !can_merge_remove_vma(next)) {
891 vmg->__remove_next = false;
892 merge_right = false;
893 merge_both = false;
894 }
895
896 /* No matter what happens, we will be adjusting middle. */
897 vma_start_write(middle);
898
899 if (merge_right) {
900 vma_start_write(next);
901 vmg->target = next;
902 }
903
904 if (merge_left) {
905 vma_start_write(prev);
906 vmg->target = prev;
907 }
908
909 if (merge_both) {
910 /*
911 * |<-------------------->|
912 * |-------********-------|
913 * prev middle next
914 * extend delete delete
915 */
916
917 vmg->start = prev->vm_start;
918 vmg->end = next->vm_end;
919 vmg->pgoff = prev->vm_pgoff;
920
921 /*
922 * We already ensured anon_vma compatibility above, so now it's
923 * simply a case of, if prev has no anon_vma object, which of
924 * next or middle contains the anon_vma we must duplicate.
925 */
926 err = dup_anon_vma(prev, next->anon_vma ? next : middle,
927 &anon_dup);
928 } else if (merge_left) {
929 /*
930 * |<------------>| OR
931 * |<----------------->|
932 * |-------*************
933 * prev middle
934 * extend shrink/delete
935 */
936
937 vmg->start = prev->vm_start;
938 vmg->pgoff = prev->vm_pgoff;
939
940 if (!vmg->__remove_middle)
941 vmg->__adjust_middle_start = true;
942
943 err = dup_anon_vma(prev, middle, &anon_dup);
944 } else { /* merge_right */
945 /*
946 * |<------------->| OR
947 * |<----------------->|
948 * *************-------|
949 * middle next
950 * shrink/delete extend
951 */
952
953 pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
954
955 VM_WARN_ON_VMG(!merge_right, vmg);
956 /* If we are offset into a VMA, then prev must be middle. */
957 VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
958
959 if (vmg->__remove_middle) {
960 vmg->end = next->vm_end;
961 vmg->pgoff = next->vm_pgoff - pglen;
962 } else {
963 /* We shrink middle and expand next. */
964 vmg->__adjust_next_start = true;
965 vmg->start = middle->vm_start;
966 vmg->end = start;
967 vmg->pgoff = middle->vm_pgoff;
968 }
969
970 err = dup_anon_vma(next, middle, &anon_dup);
971 }
972
973 if (err || commit_merge(vmg))
974 goto abort;
975
976 khugepaged_enter_vma(vmg->target, vmg->vm_flags);
977 vmg->state = VMA_MERGE_SUCCESS;
978 return vmg->target;
979
980 abort:
981 vma_iter_set(vmg->vmi, start);
982 vma_iter_load(vmg->vmi);
983
984 if (anon_dup)
985 unlink_anon_vmas(anon_dup);
986
987 /*
	 * This means we have failed to clone anon_vmas correctly, but no
989 * actual changes to VMAs have occurred, so no harm no foul - if the
990 * user doesn't want this reported and instead just wants to give up on
991 * the merge, allow it.
992 */
993 if (!vmg->give_up_on_oom)
994 vmg->state = VMA_MERGE_ERROR_NOMEM;
995 return NULL;
996 }
997
998 /*
999 * vma_merge_new_range - Attempt to merge a new VMA into address space
1000 *
1001 * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
1002 * (exclusive), which we try to merge with any adjacent VMAs if possible.
1003 *
1004 * We are about to add a VMA to the address space starting at @vmg->start and
1005 * ending at @vmg->end. There are three different possible scenarios:
1006 *
1007 * 1. There is a VMA with identical properties immediately adjacent to the
1008 * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
1009 * EXPAND that VMA:
1010 *
1011 * Proposed: |-----| or |-----|
1012 * Existing: |----| |----|
1013 *
1014 * 2. There are VMAs with identical properties immediately adjacent to the
1015 * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
1016 * EXPAND the former and REMOVE the latter:
1017 *
1018 * Proposed: |-----|
1019 * Existing: |----| |----|
1020 *
1021 * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
1022 * VMAs do not have identical attributes - NO MERGE POSSIBLE.
1023 *
1024 * In instances where we can merge, this function returns the expanded VMA which
1025 * will have its range adjusted accordingly and the underlying maple tree also
1026 * adjusted.
1027 *
1028 * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
1029 * to the VMA we expanded.
1030 *
1031 * This function adjusts @vmg to provide @vmg->next if not already specified,
1032 * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
1033 *
1034 * ASSUMPTIONS:
1035 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
 *   other than VMAs that will be unmapped should the operation succeed.
1038 * - The caller must have specified the previous vma in @vmg->prev.
1039 * - The caller must have specified the next vma in @vmg->next.
1040 * - The caller must have positioned the vmi at or before the gap.
1041 */
struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
1043 {
1044 struct vm_area_struct *prev = vmg->prev;
1045 struct vm_area_struct *next = vmg->next;
1046 unsigned long end = vmg->end;
1047 bool can_merge_left, can_merge_right;
1048
1049 mmap_assert_write_locked(vmg->mm);
1050 VM_WARN_ON_VMG(vmg->middle, vmg);
1051 VM_WARN_ON_VMG(vmg->target, vmg);
1052 /* vmi must point at or before the gap. */
1053 VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
1054
1055 vmg->state = VMA_MERGE_NOMERGE;
1056
1057 /* Special VMAs are unmergeable, also if no prev/next. */
1058 if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
1059 return NULL;
1060
1061 can_merge_left = can_vma_merge_left(vmg);
1062 can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
1063
1064 /* If we can merge with the next VMA, adjust vmg accordingly. */
1065 if (can_merge_right) {
1066 vmg->end = next->vm_end;
1067 vmg->target = next;
1068 }
1069
1070 /* If we can merge with the previous VMA, adjust vmg accordingly. */
1071 if (can_merge_left) {
1072 vmg->start = prev->vm_start;
1073 vmg->target = prev;
1074 vmg->pgoff = prev->vm_pgoff;
1075
1076 /*
1077 * If this merge would result in removal of the next VMA but we
1078 * are not permitted to do so, reduce the operation to merging
1079 * prev and vma.
1080 */
1081 if (can_merge_right && !can_merge_remove_vma(next))
1082 vmg->end = end;
1083
1084 /* In expand-only case we are already positioned at prev. */
1085 if (!vmg->just_expand) {
1086 /* Equivalent to going to the previous range. */
1087 vma_prev(vmg->vmi);
1088 }
1089 }
1090
1091 /*
1092 * Now try to expand adjacent VMA(s). This takes care of removing the
1093 * following VMA if we have VMAs on both sides.
1094 */
1095 if (vmg->target && !vma_expand(vmg)) {
1096 khugepaged_enter_vma(vmg->target, vmg->vm_flags);
1097 vmg->state = VMA_MERGE_SUCCESS;
1098 return vmg->target;
1099 }
1100
1101 return NULL;
1102 }
1103
1104 /*
1105 * vma_expand - Expand an existing VMA
1106 *
1107 * @vmg: Describes a VMA expansion operation.
1108 *
1109 * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
1110 * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
1111 * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
1112 * vmg->next needs to be handled by the caller.
1113 *
1114 * Returns: 0 on success.
1115 *
1116 * ASSUMPTIONS:
1117 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
1118 * - The caller must have set @vmg->target and @vmg->next.
1119 */
int vma_expand(struct vma_merge_struct *vmg)
1121 {
1122 struct vm_area_struct *anon_dup = NULL;
1123 bool remove_next = false;
1124 struct vm_area_struct *target = vmg->target;
1125 struct vm_area_struct *next = vmg->next;
1126
1127 VM_WARN_ON_VMG(!target, vmg);
1128
1129 mmap_assert_write_locked(vmg->mm);
1130
1131 vma_start_write(target);
1132 if (next && (target != next) && (vmg->end == next->vm_end)) {
1133 int ret;
1134
1135 remove_next = true;
1136 /* This should already have been checked by this point. */
1137 VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
1138 vma_start_write(next);
1139 /*
		 * In this case we don't report OOM, so vmg->give_up_on_oom is
1141 * safe.
1142 */
1143 ret = dup_anon_vma(target, next, &anon_dup);
1144 if (ret)
1145 return ret;
1146 }
1147
1148 /* Not merging but overwriting any part of next is not handled. */
1149 VM_WARN_ON_VMG(next && !remove_next &&
1150 next != target && vmg->end > next->vm_start, vmg);
1151 /* Only handles expanding */
1152 VM_WARN_ON_VMG(target->vm_start < vmg->start ||
1153 target->vm_end > vmg->end, vmg);
1154
1155 if (remove_next)
1156 vmg->__remove_next = true;
1157
1158 if (commit_merge(vmg))
1159 goto nomem;
1160
1161 return 0;
1162
1163 nomem:
1164 if (anon_dup)
1165 unlink_anon_vmas(anon_dup);
	/*
	 * If the user requests that we just give up on OOM, we are safe to do
	 * so here, as commit_merge() provides this contract to us. Nothing has
	 * been changed - no harm no foul, just don't report it.
	 */
1171 if (!vmg->give_up_on_oom)
1172 vmg->state = VMA_MERGE_ERROR_NOMEM;
1173 return -ENOMEM;
1174 }
1175
/*
 * vma_shrink() - Reduce an existing VMA's memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The new page offset of @vma
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff)
1187 {
1188 struct vma_prepare vp;
1189
1190 WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
1191
1192 if (vma->vm_start < start)
1193 vma_iter_config(vmi, vma->vm_start, start);
1194 else
1195 vma_iter_config(vmi, end, vma->vm_end);
1196
1197 if (vma_iter_prealloc(vmi, NULL))
1198 return -ENOMEM;
1199
1200 vma_start_write(vma);
1201
1202 init_vma_prep(&vp, vma);
1203 vma_prepare(&vp);
1204 vma_adjust_trans_huge(vma, start, end, NULL);
1205
1206 vma_iter_clear(vmi);
1207 vma_set_range(vma, start, end, pgoff);
1208 vma_complete(&vp, vmi, vma->vm_mm);
1209 validate_mm(vma->vm_mm);
1210 return 0;
1211 }
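
/*
 * Illustrative sketch only (hypothetical values): shrinking a VMA spanning
 * [0x1000, 0x5000) down to its first half might look like
 *
 *	vma_shrink(&vmi, vma, 0x1000, 0x3000, vma->vm_pgoff);
 *
 * which clears the maple tree entries for the dropped tail and leaves the VMA
 * covering [0x1000, 0x3000).
 */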
1212
static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach, bool mm_wr_locked)
1215 {
1216 struct mmu_gather tlb;
1217
1218 if (!vms->clear_ptes) /* Nothing to do */
1219 return;
1220
1221 /*
1222 * We can free page tables without write-locking mmap_lock because VMAs
1223 * were isolated before we downgraded mmap_lock.
1224 */
1225 mas_set(mas_detach, 1);
1226 tlb_gather_mmu(&tlb, vms->vma->vm_mm);
1227 update_hiwater_rss(vms->vma->vm_mm);
1228 unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
1229 vms->vma_count, mm_wr_locked);
1230
1231 mas_set(mas_detach, 1);
1232 /* start and end may be different if there is no prev or next vma. */
1233 free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
1234 vms->unmap_end, mm_wr_locked);
1235 tlb_finish_mmu(&tlb);
1236 vms->clear_ptes = false;
1237 }
1238
static void vms_clean_up_area(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
1241 {
1242 struct vm_area_struct *vma;
1243
1244 if (!vms->nr_pages)
1245 return;
1246
1247 vms_clear_ptes(vms, mas_detach, true);
1248 mas_set(mas_detach, 0);
1249 mas_for_each(mas_detach, vma, ULONG_MAX)
1250 vma_close(vma);
1251 }
1252
1253 /*
1254 * vms_complete_munmap_vmas() - Finish the munmap() operation
1255 * @vms: The vma munmap struct
1256 * @mas_detach: The maple state of the detached vmas
1257 *
 * This updates the mm_struct, unmaps the region, frees the resources
 * used for the munmap() and may downgrade the lock - if requested. It covers
 * everything that needs to be done once the vma maple tree has been updated.
1261 */
static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
1264 {
1265 struct vm_area_struct *vma;
1266 struct mm_struct *mm;
1267
1268 mm = current->mm;
1269 mm->map_count -= vms->vma_count;
1270 mm->locked_vm -= vms->locked_vm;
1271 if (vms->unlock)
1272 mmap_write_downgrade(mm);
1273
1274 if (!vms->nr_pages)
1275 return;
1276
1277 vms_clear_ptes(vms, mas_detach, !vms->unlock);
1278 /* Update high watermark before we lower total_vm */
1279 update_hiwater_vm(mm);
1280 /* Stat accounting */
1281 WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
1282 /* Paranoid bookkeeping */
1283 VM_WARN_ON(vms->exec_vm > mm->exec_vm);
1284 VM_WARN_ON(vms->stack_vm > mm->stack_vm);
1285 VM_WARN_ON(vms->data_vm > mm->data_vm);
1286 mm->exec_vm -= vms->exec_vm;
1287 mm->stack_vm -= vms->stack_vm;
1288 mm->data_vm -= vms->data_vm;
1289
1290 /* Remove and clean up vmas */
1291 mas_set(mas_detach, 0);
1292 mas_for_each(mas_detach, vma, ULONG_MAX)
1293 remove_vma(vma);
1294
1295 vm_unacct_memory(vms->nr_accounted);
1296 validate_mm(mm);
1297 if (vms->unlock)
1298 mmap_read_unlock(mm);
1299
1300 __mt_destroy(mas_detach->tree);
1301 }
1302
1303 /*
1304 * reattach_vmas() - Undo any munmap work and free resources
1305 * @mas_detach: The maple state with the detached maple tree
1306 *
1307 * Reattach any detached vmas and free up the maple tree used to track the vmas.
1308 */
static void reattach_vmas(struct ma_state *mas_detach)
1310 {
1311 struct vm_area_struct *vma;
1312
1313 mas_set(mas_detach, 0);
1314 mas_for_each(mas_detach, vma, ULONG_MAX)
1315 vma_mark_attached(vma);
1316
1317 __mt_destroy(mas_detach->tree);
1318 }
1319
1320 /*
1321 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
1322 * for removal at a later date. Handles splitting first and last if necessary
1323 * and marking the vmas as isolated.
1324 *
1325 * @vms: The vma munmap struct
1326 * @mas_detach: The maple state tracking the detached tree
1327 *
1328 * Return: 0 on success, error otherwise
1329 */
static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
1332 {
1333 struct vm_area_struct *next = NULL;
1334 int error;
1335
1336 /*
1337 * If we need to split any vma, do it now to save pain later.
1338 * Does it split the first one?
1339 */
1340 if (vms->start > vms->vma->vm_start) {
1341
1342 /*
1343 * Make sure that map_count on return from munmap() will
1344 * not exceed its limit; but let map_count go just above
1345 * its limit temporarily, to help free resources as expected.
1346 */
1347 if (vms->end < vms->vma->vm_end &&
1348 vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
1349 error = -ENOMEM;
1350 goto map_count_exceeded;
1351 }
1352
1353 /* Don't bother splitting the VMA if we can't unmap it anyway */
1354 if (!can_modify_vma(vms->vma)) {
1355 error = -EPERM;
1356 goto start_split_failed;
1357 }
1358
1359 error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
1360 if (error)
1361 goto start_split_failed;
1362 }
1363 vms->prev = vma_prev(vms->vmi);
1364 if (vms->prev)
1365 vms->unmap_start = vms->prev->vm_end;
1366
1367 /*
1368 * Detach a range of VMAs from the mm. Using next as a temp variable as
1369 * it is always overwritten.
1370 */
1371 for_each_vma_range(*(vms->vmi), next, vms->end) {
1372 long nrpages;
1373
1374 if (!can_modify_vma(next)) {
1375 error = -EPERM;
1376 goto modify_vma_failed;
1377 }
1378 /* Does it split the end? */
1379 if (next->vm_end > vms->end) {
1380 error = __split_vma(vms->vmi, next, vms->end, 0);
1381 if (error)
1382 goto end_split_failed;
1383 }
1384 vma_start_write(next);
1385 mas_set(mas_detach, vms->vma_count++);
1386 error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
1387 if (error)
1388 goto munmap_gather_failed;
1389
1390 vma_mark_detached(next);
1391 nrpages = vma_pages(next);
1392
1393 vms->nr_pages += nrpages;
1394 if (next->vm_flags & VM_LOCKED)
1395 vms->locked_vm += nrpages;
1396
1397 if (next->vm_flags & VM_ACCOUNT)
1398 vms->nr_accounted += nrpages;
1399
1400 if (is_exec_mapping(next->vm_flags))
1401 vms->exec_vm += nrpages;
1402 else if (is_stack_mapping(next->vm_flags))
1403 vms->stack_vm += nrpages;
1404 else if (is_data_mapping(next->vm_flags))
1405 vms->data_vm += nrpages;
1406
1407 if (vms->uf) {
			/*
			 * If userfaultfd_unmap_prep returns an error the vmas
			 * will remain split, but userland will get a
			 * highly unexpected error anyway. This is no
			 * different than the case where the first of the two
			 * __split_vma fails, but we don't undo the first
			 * split, even though we could. This is an unlikely
			 * enough failure that it's not worth optimizing for.
			 */
1417 error = userfaultfd_unmap_prep(next, vms->start,
1418 vms->end, vms->uf);
1419 if (error)
1420 goto userfaultfd_error;
1421 }
1422 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
1423 BUG_ON(next->vm_start < vms->start);
1424 BUG_ON(next->vm_start > vms->end);
1425 #endif
1426 }
1427
1428 vms->next = vma_next(vms->vmi);
1429 if (vms->next)
1430 vms->unmap_end = vms->next->vm_start;
1431
1432 #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
1433 /* Make sure no VMAs are about to be lost. */
1434 {
1435 MA_STATE(test, mas_detach->tree, 0, 0);
1436 struct vm_area_struct *vma_mas, *vma_test;
1437 int test_count = 0;
1438
1439 vma_iter_set(vms->vmi, vms->start);
1440 rcu_read_lock();
1441 vma_test = mas_find(&test, vms->vma_count - 1);
1442 for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
1443 BUG_ON(vma_mas != vma_test);
1444 test_count++;
1445 vma_test = mas_next(&test, vms->vma_count - 1);
1446 }
1447 rcu_read_unlock();
1448 BUG_ON(vms->vma_count != test_count);
1449 }
1450 #endif
1451
1452 while (vma_iter_addr(vms->vmi) > vms->start)
1453 vma_iter_prev_range(vms->vmi);
1454
1455 vms->clear_ptes = true;
1456 return 0;
1457
1458 userfaultfd_error:
1459 munmap_gather_failed:
1460 end_split_failed:
1461 modify_vma_failed:
1462 reattach_vmas(mas_detach);
1463 start_split_failed:
1464 map_count_exceeded:
1465 return error;
1466 }
1467
1468 /*
1469 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
1470 * @vms: The vma munmap struct
1471 * @vmi: The vma iterator
1472 * @vma: The first vm_area_struct to munmap
1473 * @start: The aligned start address to munmap
1474 * @end: The aligned end address to munmap
1475 * @uf: The userfaultfd list_head
1476 * @unlock: Unlock after the operation. Only unlocked on success
1477 */
static void init_vma_munmap(struct vma_munmap_struct *vms,
		struct vma_iterator *vmi, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, struct list_head *uf,
		bool unlock)
1482 {
1483 vms->vmi = vmi;
1484 vms->vma = vma;
1485 if (vma) {
1486 vms->start = start;
1487 vms->end = end;
1488 } else {
1489 vms->start = vms->end = 0;
1490 }
1491 vms->unlock = unlock;
1492 vms->uf = uf;
1493 vms->vma_count = 0;
1494 vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
1495 vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
1496 vms->unmap_start = FIRST_USER_ADDRESS;
1497 vms->unmap_end = USER_PGTABLES_CEILING;
1498 vms->clear_ptes = false;
1499 }
1500
1501 /*
1502 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
1503 * @vmi: The vma iterator
1504 * @vma: The starting vm_area_struct
1505 * @mm: The mm_struct
1506 * @start: The aligned start address to munmap.
1507 * @end: The aligned end address to munmap.
1508 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
 * success.
1511 *
1512 * Return: 0 on success and drops the lock if so directed, error and leaves the
1513 * lock held otherwise.
1514 */
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct mm_struct *mm, unsigned long start, unsigned long end,
		struct list_head *uf, bool unlock)
1518 {
1519 struct maple_tree mt_detach;
1520 MA_STATE(mas_detach, &mt_detach, 0, 0);
1521 mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
1522 mt_on_stack(mt_detach);
1523 struct vma_munmap_struct vms;
1524 int error;
1525
1526 init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
1527 error = vms_gather_munmap_vmas(&vms, &mas_detach);
1528 if (error)
1529 goto gather_failed;
1530
1531 error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
1532 if (error)
1533 goto clear_tree_failed;
1534
1535 /* Point of no return */
1536 vms_complete_munmap_vmas(&vms, &mas_detach);
1537 return 0;
1538
1539 clear_tree_failed:
1540 reattach_vmas(&mas_detach);
1541 gather_failed:
1542 validate_mm(mm);
1543 return error;
1544 }
1545
1546 /*
1547 * do_vmi_munmap() - munmap a given range.
1548 * @vmi: The vma iterator
1549 * @mm: The mm_struct
1550 * @start: The start address to munmap
1551 * @len: The length of the range to munmap
1552 * @uf: The userfaultfd list_head
1553 * @unlock: set to true if the user wants to drop the mmap_lock on success
1554 *
 * This function takes a @vmi that is either pointing to the previous VMA or
 * set to MA_START and sets it up to remove the mapping(s). The @len will be
 * aligned.
1558 *
1559 * Return: 0 on success and drops the lock if so directed, error and leaves the
1560 * lock held otherwise.
1561 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
		  unsigned long start, size_t len, struct list_head *uf,
		  bool unlock)
1565 {
1566 unsigned long end;
1567 struct vm_area_struct *vma;
1568
1569 if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
1570 return -EINVAL;
1571
1572 end = start + PAGE_ALIGN(len);
1573 if (end == start)
1574 return -EINVAL;
1575
1576 /* Find the first overlapping VMA */
1577 vma = vma_find(vmi, end);
1578 if (!vma) {
1579 if (unlock)
1580 mmap_write_unlock(mm);
1581 return 0;
1582 }
1583
1584 return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
1585 }
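
/*
 * Illustrative sketch only (hypothetical caller, names assumed): a typical
 * munmap-style caller holding the mmap write lock might do
 *
 *	VMA_ITERATOR(vmi, mm, start);
 *	ret = do_vmi_munmap(&vmi, mm, start, len, &uf, false);
 *
 * with @uf an initialised userfaultfd list_head; the lock is still held on
 * return because @unlock is false.
 */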
1586
1587 /*
 * We are about to modify one or more of a VMA's flags, policy, userfaultfd
1589 * context and anonymous VMA name within the range [start, end).
1590 *
1591 * As a result, we might be able to merge the newly modified VMA range with an
1592 * adjacent VMA with identical properties.
1593 *
1594 * If no merge is possible and the range does not span the entirety of the VMA,
1595 * we then need to split the VMA to accommodate the change.
1596 *
1597 * The function returns either the merged VMA, the original VMA if a split was
1598 * required instead, or an error if the split failed.
1599 */
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
1601 {
1602 struct vm_area_struct *vma = vmg->middle;
1603 unsigned long start = vmg->start;
1604 unsigned long end = vmg->end;
1605 struct vm_area_struct *merged;
1606
1607 /* First, try to merge. */
1608 merged = vma_merge_existing_range(vmg);
1609 if (merged)
1610 return merged;
1611 if (vmg_nomem(vmg))
1612 return ERR_PTR(-ENOMEM);
1613
1614 /*
1615 * Split can fail for reasons other than OOM, so if the user requests
1616 * this it's probably a mistake.
1617 */
1618 VM_WARN_ON(vmg->give_up_on_oom &&
1619 (vma->vm_start != start || vma->vm_end != end));
1620
1621 /* Split any preceding portion of the VMA. */
1622 if (vma->vm_start < start) {
1623 int err = split_vma(vmg->vmi, vma, start, 1);
1624
1625 if (err)
1626 return ERR_PTR(err);
1627 }
1628
1629 /* Split any trailing portion of the VMA. */
1630 if (vma->vm_end > end) {
1631 int err = split_vma(vmg->vmi, vma, end, 0);
1632
1633 if (err)
1634 return ERR_PTR(err);
1635 }
1636
1637 return vma;
1638 }
1639
struct vm_area_struct *vma_modify_flags(
		struct vma_iterator *vmi, struct vm_area_struct *prev,
		struct vm_area_struct *vma, unsigned long start, unsigned long end,
		vm_flags_t vm_flags)
1644 {
1645 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1646
1647 vmg.vm_flags = vm_flags;
1648
1649 return vma_modify(&vmg);
1650 }
1651
struct vm_area_struct
*vma_modify_name(struct vma_iterator *vmi,
		 struct vm_area_struct *prev,
		 struct vm_area_struct *vma,
		 unsigned long start,
		 unsigned long end,
		 struct anon_vma_name *new_name)
1659 {
1660 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1661
1662 vmg.anon_name = new_name;
1663
1664 return vma_modify(&vmg);
1665 }
1666
struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
		   struct vm_area_struct *prev,
		   struct vm_area_struct *vma,
		   unsigned long start, unsigned long end,
		   struct mempolicy *new_pol)
1673 {
1674 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1675
1676 vmg.policy = new_pol;
1677
1678 return vma_modify(&vmg);
1679 }
1680
struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
		       struct vm_area_struct *prev,
		       struct vm_area_struct *vma,
		       unsigned long start, unsigned long end,
		       vm_flags_t vm_flags,
		       struct vm_userfaultfd_ctx new_ctx,
		       bool give_up_on_oom)
1689 {
1690 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1691
1692 vmg.vm_flags = vm_flags;
1693 vmg.uffd_ctx = new_ctx;
1694 if (give_up_on_oom)
1695 vmg.give_up_on_oom = true;
1696
1697 return vma_modify(&vmg);
1698 }
1699
1700 /*
1701 * Expand vma by delta bytes, potentially merging with an immediately adjacent
1702 * VMA with identical properties.
1703 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
					struct vm_area_struct *vma,
					unsigned long delta)
1707 {
1708 VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
1709
1710 vmg.next = vma_iter_next_rewind(vmi, NULL);
1711 vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */
1712
1713 return vma_merge_new_range(&vmg);
1714 }
1715
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
1717 {
1718 vb->count = 0;
1719 }
1720
static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
1722 {
1723 struct address_space *mapping;
1724 int i;
1725
1726 mapping = vb->vmas[0]->vm_file->f_mapping;
1727 i_mmap_lock_write(mapping);
1728 for (i = 0; i < vb->count; i++) {
1729 VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
1730 __remove_shared_vm_struct(vb->vmas[i], mapping);
1731 }
1732 i_mmap_unlock_write(mapping);
1733
1734 unlink_file_vma_batch_init(vb);
1735 }
1736
void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
			       struct vm_area_struct *vma)
1739 {
1740 if (vma->vm_file == NULL)
1741 return;
1742
1743 if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
1744 vb->count == ARRAY_SIZE(vb->vmas))
1745 unlink_file_vma_batch_process(vb);
1746
1747 vb->vmas[vb->count] = vma;
1748 vb->count++;
1749 }
1750
void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
1752 {
1753 if (vb->count > 0)
1754 unlink_file_vma_batch_process(vb);
1755 }
1756
1757 /*
1758 * Unlink a file-based vm structure from its interval tree, to hide
1759 * vma from rmap and vmtruncate before freeing its page tables.
1760 */
1761 void unlink_file_vma(struct vm_area_struct *vma)
1762 {
1763 struct file *file = vma->vm_file;
1764
1765 if (file) {
1766 struct address_space *mapping = file->f_mapping;
1767
1768 i_mmap_lock_write(mapping);
1769 __remove_shared_vm_struct(vma, mapping);
1770 i_mmap_unlock_write(mapping);
1771 }
1772 }
1773
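/*
 * Link a file-backed VMA into its address_space interval tree so rmap can find
 * it, taking i_mmap_rwsem for writing around the insertion.
 */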
1774 void vma_link_file(struct vm_area_struct *vma)
1775 {
1776 struct file *file = vma->vm_file;
1777 struct address_space *mapping;
1778
1779 if (file) {
1780 mapping = file->f_mapping;
1781 i_mmap_lock_write(mapping);
1782 __vma_link_file(vma, mapping);
1783 i_mmap_unlock_write(mapping);
1784 }
1785 }
1786
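/*
 * Insert @vma into the VMA tree of @mm, link any file mapping and bump the map
 * count. Returns 0 on success, or -ENOMEM if preallocating tree nodes fails.
 */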
1787 int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
1788 {
1789 VMA_ITERATOR(vmi, mm, 0);
1790
1791 vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
1792 if (vma_iter_prealloc(&vmi, vma))
1793 return -ENOMEM;
1794
1795 vma_start_write(vma);
1796 vma_iter_store_new(&vmi, vma);
1797 vma_link_file(vma);
1798 mm->map_count++;
1799 validate_mm(mm);
1800 return 0;
1801 }
1802
1803 /*
1804 * Copy the vma structure to a new location in the same mm,
1805 * prior to moving page table entries, to effect an mremap move.
1806 */
1807 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1808 unsigned long addr, unsigned long len, pgoff_t pgoff,
1809 bool *need_rmap_locks)
1810 {
1811 struct vm_area_struct *vma = *vmap;
1812 unsigned long vma_start = vma->vm_start;
1813 struct mm_struct *mm = vma->vm_mm;
1814 struct vm_area_struct *new_vma;
1815 bool faulted_in_anon_vma = true;
1816 VMA_ITERATOR(vmi, mm, addr);
1817 VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);
1818
1819 /*
1820 * If anonymous vma has not yet been faulted, update new pgoff
1821 * to match new location, to increase its chance of merging.
1822 */
1823 if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
1824 pgoff = addr >> PAGE_SHIFT;
1825 faulted_in_anon_vma = false;
1826 }
1827
1828 /*
1829 * If the VMA we are copying might contain a uprobe PTE, ensure
1830 * that we do not establish one upon merge. Otherwise, when mremap()
1831 * moves page tables, it will orphan the newly created PTE.
1832 */
1833 if (vma->vm_file)
1834 vmg.skip_vma_uprobe = true;
1835
1836 new_vma = find_vma_prev(mm, addr, &vmg.prev);
1837 if (new_vma && new_vma->vm_start < addr + len)
1838 return NULL; /* should never get here */
1839
1840 vmg.middle = NULL; /* New VMA range. */
1841 vmg.pgoff = pgoff;
1842 vmg.next = vma_iter_next_rewind(&vmi, NULL);
1843 new_vma = vma_merge_new_range(&vmg);
1844
1845 if (new_vma) {
1846 /*
1847 * Source vma may have been merged into new_vma
1848 */
1849 if (unlikely(vma_start >= new_vma->vm_start &&
1850 vma_start < new_vma->vm_end)) {
1851 /*
1852 * The only way we can get a vma_merge with
1853 * self during an mremap is if the vma hasn't
1854 * been faulted in yet and we were allowed to
1855 * reset the dst vma->vm_pgoff to the
1856 * destination address of the mremap to allow
1857 * the merge to happen. mremap must change the
1858 * vm_pgoff linearity between src and dst vmas
1859 * (in turn preventing a vma_merge) to be
1860 * safe. It is only safe to keep the vm_pgoff
1861 * linear if there are no pages mapped yet.
1862 */
1863 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
1864 *vmap = vma = new_vma;
1865 }
1866 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
1867 } else {
1868 new_vma = vm_area_dup(vma);
1869 if (!new_vma)
1870 goto out;
1871 vma_set_range(new_vma, addr, addr + len, pgoff);
1872 if (vma_dup_policy(vma, new_vma))
1873 goto out_free_vma;
1874 if (anon_vma_clone(new_vma, vma))
1875 goto out_free_mempol;
1876 if (new_vma->vm_file)
1877 get_file(new_vma->vm_file);
1878 if (new_vma->vm_ops && new_vma->vm_ops->open)
1879 new_vma->vm_ops->open(new_vma);
1880 if (vma_link(mm, new_vma))
1881 goto out_vma_link;
1882 *need_rmap_locks = false;
1883 }
1884 return new_vma;
1885
1886 out_vma_link:
1887 fixup_hugetlb_reservations(new_vma);
1888 vma_close(new_vma);
1889
1890 if (new_vma->vm_file)
1891 fput(new_vma->vm_file);
1892
1893 unlink_anon_vmas(new_vma);
1894 out_free_mempol:
1895 mpol_put(vma_policy(new_vma));
1896 out_free_vma:
1897 vm_area_free(new_vma);
1898 out:
1899 return NULL;
1900 }
1901
1902 /*
1903 * Rough compatibility check to quickly see if it's even worth looking
1904 * at sharing an anon_vma.
1905 *
1906 * They need to have the same vm_file, and the flags can only differ
1907 * in things that mprotect may change.
1908 *
1909 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1910 * we can merge the two vma's. For example, we refuse to merge a vma if
1911 * there is a vm_ops->close() function, because that indicates that the
1912 * driver is doing some kind of reference counting. But that doesn't
1913 * really matter for the anon_vma sharing case.
1914 */
1915 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1916 {
1917 return a->vm_end == b->vm_start &&
1918 mpol_equal(vma_policy(a), vma_policy(b)) &&
1919 a->vm_file == b->vm_file &&
1920 !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
1921 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1922 }
1923
1924 /*
1925 * Do some basic sanity checking to see if we can re-use the anon_vma
1926 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1927 * the same as 'old', the other will be the new one that is trying
1928 * to share the anon_vma.
1929 *
1930 * NOTE! This runs with mmap_lock held for reading, so it is possible that
1931 * the anon_vma of 'old' is concurrently in the process of being set up
1932 * by another page fault trying to merge _that_. But that's ok: if it
1933 * is being set up, that automatically means that it will be a singleton
1934 * acceptable for merging, so we can do all of this optimistically. But
1935 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1936 *
1937 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1938 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1939 * is to return an anon_vma that is "complex" due to having gone through
1940 * a fork).
1941 *
1942 * We also make sure that the two vma's are compatible (adjacent,
1943 * and with the same memory policies). That's all stable, even with just
1944 * a read lock on the mmap_lock.
1945 */
1946 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
1947 struct vm_area_struct *a,
1948 struct vm_area_struct *b)
1949 {
1950 if (anon_vma_compatible(a, b)) {
1951 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1952
1953 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1954 return anon_vma;
1955 }
1956 return NULL;
1957 }
1958
1959 /*
1960 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1961 * neighbouring vmas for a suitable anon_vma, before it goes off
1962 * to allocate a new anon_vma. It checks them because a repetitive
1963 * sequence of mprotects and faults may otherwise lead to distinct
1964 * anon_vmas being allocated, preventing vma merge in subsequent
1965 * mprotect.
1966 */
1967 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1968 {
1969 struct anon_vma *anon_vma = NULL;
1970 struct vm_area_struct *prev, *next;
1971 VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
1972
1973 /* Try next first. */
1974 next = vma_iter_load(&vmi);
1975 if (next) {
1976 anon_vma = reusable_anon_vma(next, vma, next);
1977 if (anon_vma)
1978 return anon_vma;
1979 }
1980
1981 prev = vma_prev(&vmi);
1982 VM_BUG_ON_VMA(prev != vma, vma);
1983 prev = vma_prev(&vmi);
1984 /* Now try prev. */
1985 if (prev)
1986 anon_vma = reusable_anon_vma(prev, prev, vma);
1987
1988 /*
1989 * We might reach here with anon_vma == NULL if we can't find
1990 * any reusable anon_vma.
1991 * There's no absolute need to look only at touching neighbours:
1992 * we could search further afield for "compatible" anon_vmas.
1993 * But it would probably just be a waste of time searching,
1994 * or lead to too many vmas hanging off the same anon_vma.
1995 * We're trying to allow mprotect remerging later on,
1996 * not trying to minimize memory used for anon_vmas.
1997 */
1998 return anon_vma;
1999 }
2000
2001 static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
2002 {
2003 return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
2004 }
2005
2006 static bool vma_is_shared_writable(struct vm_area_struct *vma)
2007 {
2008 return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
2009 (VM_WRITE | VM_SHARED);
2010 }
2011
2012 static bool vma_fs_can_writeback(struct vm_area_struct *vma)
2013 {
2014 /* No managed pages to writeback. */
2015 if (vma->vm_flags & VM_PFNMAP)
2016 return false;
2017
2018 return vma->vm_file && vma->vm_file->f_mapping &&
2019 mapping_can_writeback(vma->vm_file->f_mapping);
2020 }
2021
2022 /*
2023 * Does this VMA require the underlying folios to have their dirty state
2024 * tracked?
2025 */
2026 bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
2027 {
2028 /* Only shared, writable VMAs require dirty tracking. */
2029 if (!vma_is_shared_writable(vma))
2030 return false;
2031
2032 /* Does the filesystem need to be notified? */
2033 if (vm_ops_needs_writenotify(vma->vm_ops))
2034 return true;
2035
2036 /*
2037 * Even if the filesystem doesn't indicate a need for writenotify, if it
2038 * can writeback, dirty tracking is still required.
2039 */
2040 return vma_fs_can_writeback(vma);
2041 }
2042
2043 /*
2044 * Some shared mappings will want the pages marked read-only
2045 * to track write events. If so, we'll downgrade vm_page_prot
2046 * to the private version (using protection_map[] without the
2047 * VM_SHARED bit).
2048 */
2049 bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
2050 {
2051 /* If it was private or non-writable, the write bit is already clear */
2052 if (!vma_is_shared_writable(vma))
2053 return false;
2054
2055 /* The backer wishes to know when pages are first written to? */
2056 if (vm_ops_needs_writenotify(vma->vm_ops))
2057 return true;
2058
2059 /* The open routine did something to the protections that pgprot_modify
2060 * won't preserve? */
2061 if (pgprot_val(vm_page_prot) !=
2062 pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
2063 return false;
2064
2065 /*
2066 * Do we need to track softdirty? hugetlb does not support softdirty
2067 * tracking yet.
2068 */
2069 if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
2070 return true;
2071
2072 /* Do we need write faults for uffd-wp tracking? */
2073 if (userfaultfd_wp(vma))
2074 return true;
2075
2076 /* Can the mapping track the dirty pages? */
2077 return vma_fs_can_writeback(vma);
2078 }
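
/*
 * Illustrative only (not part of this file): a typical caller combines
 * vma_wants_writenotify() with vm_pgprot_modify(), recomputing the protection
 * without VM_SHARED when write notifications are wanted, roughly:
 *
 *	vm_flags_t flags = vma->vm_flags;
 *	pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);
 *
 *	if (vma_wants_writenotify(vma, prot)) {
 *		flags &= ~VM_SHARED;
 *		prot = vm_pgprot_modify(prot, flags);
 *	}
 *	WRITE_ONCE(vma->vm_page_prot, prot);
 *
 * This is only a sketch of what vma_set_page_prot() does; see its
 * implementation for the authoritative version.
 */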
2079
2080 static DEFINE_MUTEX(mm_all_locks_mutex);
2081
2082 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2083 {
2084 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2085 /*
2086 * The LSB of head.next can't change from under us
2087 * because we hold the mm_all_locks_mutex.
2088 */
2089 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
2090 /*
2091 * We can safely modify head.next after taking the
2092 * anon_vma->root->rwsem. If some other vma in this mm shares
2093 * the same anon_vma we won't take it again.
2094 *
2095 * No need of atomic instructions here, head.next
2096 * can't change from under us thanks to the
2097 * anon_vma->root->rwsem.
2098 */
2099 if (__test_and_set_bit(0, (unsigned long *)
2100 &anon_vma->root->rb_root.rb_root.rb_node))
2101 BUG();
2102 }
2103 }
2104
2105 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2106 {
2107 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2108 /*
2109 * AS_MM_ALL_LOCKS can't change from under us because
2110 * we hold the mm_all_locks_mutex.
2111 *
2112 * Operations on ->flags have to be atomic because
2113 * even if AS_MM_ALL_LOCKS is stable thanks to the
2114 * mm_all_locks_mutex, there may be other cpus
2115 * changing other bitflags in parallel to us.
2116 */
2117 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2118 BUG();
2119 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
2120 }
2121 }
2122
2123 /*
2124 * This operation locks against the VM for all pte/vma/mm related
2125 * operations that could ever happen on a certain mm. This includes
2126 * vmtruncate, try_to_unmap, and all page faults.
2127 *
2128 * The caller must take the mmap_lock in write mode before calling
2129 * mm_take_all_locks(). The caller isn't allowed to release the
2130 * mmap_lock until mm_drop_all_locks() returns.
2131 *
2132 * mmap_lock in write mode is required in order to block all operations
2133 * that could modify pagetables and free pages without need of
2134 * altering the vma layout. It's also needed in write mode to prevent new
2135 * anon_vmas from being associated with existing vmas.
2136 *
2137 * A single task can't take more than one mm_take_all_locks() in a row
2138 * or it would deadlock.
2139 *
2140 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2141 * mapping->flags ensure that we never take the same lock twice if more than
2142 * one vma in this mm is backed by the same anon_vma or address_space.
2143 *
2144 * We take locks in the following order, according to the comment at the
2145 * beginning of mm/rmap.c:
2146 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
2147 * hugetlb mappings);
2148 * - all vmas marked locked
2149 * - all i_mmap_rwsem locks;
2150 * - all anon_vma->rwsem locks
2151 *
2152 * We can take all locks within these types in any order because the VM code
2153 * doesn't nest them and we are protected from parallel mm_take_all_locks() by
2154 * mm_all_locks_mutex.
2155 *
2156 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
2157 * that may have to take thousands of locks.
2158 *
2159 * mm_take_all_locks() can fail if it's interrupted by signals.
2160 */
2161 int mm_take_all_locks(struct mm_struct *mm)
2162 {
2163 struct vm_area_struct *vma;
2164 struct anon_vma_chain *avc;
2165 VMA_ITERATOR(vmi, mm, 0);
2166
2167 mmap_assert_write_locked(mm);
2168
2169 mutex_lock(&mm_all_locks_mutex);
2170
2171 /*
2172 * vma_start_write() does not have a complement in mm_drop_all_locks()
2173 * because vma_start_write() is always asymmetrical; it marks a VMA as
2174 * being written to until mmap_write_unlock() or mmap_write_downgrade()
2175 * is reached.
2176 */
2177 for_each_vma(vmi, vma) {
2178 if (signal_pending(current))
2179 goto out_unlock;
2180 vma_start_write(vma);
2181 }
2182
2183 vma_iter_init(&vmi, mm, 0);
2184 for_each_vma(vmi, vma) {
2185 if (signal_pending(current))
2186 goto out_unlock;
2187 if (vma->vm_file && vma->vm_file->f_mapping &&
2188 is_vm_hugetlb_page(vma))
2189 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2190 }
2191
2192 vma_iter_init(&vmi, mm, 0);
2193 for_each_vma(vmi, vma) {
2194 if (signal_pending(current))
2195 goto out_unlock;
2196 if (vma->vm_file && vma->vm_file->f_mapping &&
2197 !is_vm_hugetlb_page(vma))
2198 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2199 }
2200
2201 vma_iter_init(&vmi, mm, 0);
2202 for_each_vma(vmi, vma) {
2203 if (signal_pending(current))
2204 goto out_unlock;
2205 if (vma->anon_vma)
2206 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2207 vm_lock_anon_vma(mm, avc->anon_vma);
2208 }
2209
2210 return 0;
2211
2212 out_unlock:
2213 mm_drop_all_locks(mm);
2214 return -EINTR;
2215 }
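
/*
 * Illustrative usage sketch (not part of this file): a caller that must
 * exclude all rmap and fault activity on @mm wraps the pair with the mmap
 * write lock held, e.g.
 *
 *	mmap_write_lock(mm);
 *	err = mm_take_all_locks(mm);
 *	if (!err) {
 *		... operate on every VMA, anon_vma and address_space of mm ...
 *		mm_drop_all_locks(mm);
 *	}
 *	mmap_write_unlock(mm);
 *
 * A -EINTR return (pending signal) leaves no additional locks held, as the
 * error path above has already called mm_drop_all_locks().
 */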
2216
2217 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2218 {
2219 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2220 /*
2221 * The LSB of head.next can't change to 0 from under
2222 * us because we hold the mm_all_locks_mutex.
2223 *
2224 * We must however clear the bitflag before unlocking
2225 * the vma so the users using the anon_vma->rb_root will
2226 * never see our bitflag.
2227 *
2228 * No need of atomic instructions here, head.next
2229 * can't change from under us until we release the
2230 * anon_vma->root->rwsem.
2231 */
2232 if (!__test_and_clear_bit(0, (unsigned long *)
2233 &anon_vma->root->rb_root.rb_root.rb_node))
2234 BUG();
2235 anon_vma_unlock_write(anon_vma);
2236 }
2237 }
2238
2239 static void vm_unlock_mapping(struct address_space *mapping)
2240 {
2241 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2242 /*
2243 * AS_MM_ALL_LOCKS can't change to 0 from under us
2244 * because we hold the mm_all_locks_mutex.
2245 */
2246 i_mmap_unlock_write(mapping);
2247 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2248 &mapping->flags))
2249 BUG();
2250 }
2251 }
2252
2253 /*
2254 * The mmap_lock cannot be released by the caller until
2255 * mm_drop_all_locks() returns.
2256 */
2257 void mm_drop_all_locks(struct mm_struct *mm)
2258 {
2259 struct vm_area_struct *vma;
2260 struct anon_vma_chain *avc;
2261 VMA_ITERATOR(vmi, mm, 0);
2262
2263 mmap_assert_write_locked(mm);
2264 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2265
2266 for_each_vma(vmi, vma) {
2267 if (vma->anon_vma)
2268 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2269 vm_unlock_anon_vma(avc->anon_vma);
2270 if (vma->vm_file && vma->vm_file->f_mapping)
2271 vm_unlock_mapping(vma->vm_file->f_mapping);
2272 }
2273
2274 mutex_unlock(&mm_all_locks_mutex);
2275 }
2276
2277 /*
2278 * We account for memory if it's a private writable mapping that is
2279 * not hugetlb-backed and VM_NORESERVE wasn't set.
2280 */
2281 static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
2282 {
2283 /*
2284 * hugetlb has its own accounting separate from the core VM.
2285 * VM_HUGETLB may not be set yet, so we cannot check for that flag.
2286 */
2287 if (file && is_file_hugepages(file))
2288 return false;
2289
2290 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
2291 }
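
/*
 * For example: a MAP_PRIVATE, PROT_WRITE mapping of a regular file or of
 * anonymous memory has VM_WRITE set with VM_SHARED and VM_NORESERVE clear, so
 * it is charged against the commit limit; MAP_SHARED and MAP_NORESERVE
 * mappings are not.
 */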
2292
2293 /*
2294 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
2295 * operation.
2296 * @vms: The vma unmap structure
2297 * @mas_detach: The maple state with the detached maple tree
2298 *
2299 * Reattach any detached vmas and free up the maple tree used to track the
2300 * vmas. If that's not possible because the ptes have been cleared (and
2301 * vm_ops->close() may have been called), then NULL is written over the vmas
2302 * and the vmas are removed (munmap() completed).
2303 */
2304 static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
2305 struct ma_state *mas_detach)
2306 {
2307 struct ma_state *mas = &vms->vmi->mas;
2308
2309 if (!vms->nr_pages)
2310 return;
2311
2312 if (vms->clear_ptes)
2313 return reattach_vmas(mas_detach);
2314
2315 /*
2316 * Aborting cannot simply call the vm_ops->open() callbacks because they are
2317 * often not symmetrical with ->close() and state data has been lost. Resort to
2318 * the old failure method of leaving a gap where the MAP_FIXED mapping failed.
2319 */
2320 mas_set_range(mas, vms->start, vms->end - 1);
2321 mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
2322 /* Clean up the insertion of the unfortunate gap */
2323 vms_complete_munmap_vmas(vms, mas_detach);
2324 }
2325
2326 static void update_ksm_flags(struct mmap_state *map)
2327 {
2328 map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
2329 }
2330
2331 /*
2332 * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
2333 * unmapped once the map operation is completed, check limits, account mapping
2334 * and clean up any pre-existing VMAs.
2335 *
2336 * @map: Mapping state.
2337 * @uf: Userfaultfd context list.
2338 *
2339 * Returns: 0 on success, error code otherwise.
2340 */
2341 static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
2342 {
2343 int error;
2344 struct vma_iterator *vmi = map->vmi;
2345 struct vma_munmap_struct *vms = &map->vms;
2346
2347 /* Find the first overlapping VMA and initialise unmap state. */
2348 vms->vma = vma_find(vmi, map->end);
2349 init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
2350 /* unlock = */ false);
2351
2352 /* OK, we have overlapping VMAs - prepare to unmap them. */
2353 if (vms->vma) {
2354 mt_init_flags(&map->mt_detach,
2355 vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
2356 mt_on_stack(map->mt_detach);
2357 mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
2358 /* Prepare to unmap any existing mapping in the area */
2359 error = vms_gather_munmap_vmas(vms, &map->mas_detach);
2360 if (error) {
2361 /* On error VMAs will already have been reattached. */
2362 vms->nr_pages = 0;
2363 return error;
2364 }
2365
2366 map->next = vms->next;
2367 map->prev = vms->prev;
2368 } else {
2369 map->next = vma_iter_next_rewind(vmi, &map->prev);
2370 }
2371
2372 /* Check against address space limit. */
2373 if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
2374 return -ENOMEM;
2375
2376 /* Private writable mapping: check memory availability. */
2377 if (accountable_mapping(map->file, map->vm_flags)) {
2378 map->charged = map->pglen;
2379 map->charged -= vms->nr_accounted;
2380 if (map->charged) {
2381 error = security_vm_enough_memory_mm(map->mm, map->charged);
2382 if (error)
2383 return error;
2384 }
2385
2386 vms->nr_accounted = 0;
2387 map->vm_flags |= VM_ACCOUNT;
2388 }
2389
2390 /*
2391 * Clear PTEs while the vma is still in the tree so that rmap
2392 * cannot race with the freeing later in the truncate scenario.
2393 * This is also needed for mmap_file(), which is why the
2394 * vm_ops->close() function is called.
2395 */
2396 vms_clean_up_area(vms, &map->mas_detach);
2397
2398 return 0;
2399 }
2400
2401
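/*
 * Attach @map->file to the new @vma and invoke the driver's ->mmap() hook, if
 * one is provided. On failure, any partial mapping established by the driver
 * is torn down again. On success, sanity-check that the driver neither moved
 * the VMA nor made it writable where the original request disallowed it, then
 * propagate the (possibly updated) file and flags back into the mmap state.
 */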
2402 static int __mmap_new_file_vma(struct mmap_state *map,
2403 struct vm_area_struct *vma)
2404 {
2405 struct vma_iterator *vmi = map->vmi;
2406 int error;
2407
2408 vma->vm_file = get_file(map->file);
2409
2410 if (!map->file->f_op->mmap)
2411 return 0;
2412
2413 error = mmap_file(vma->vm_file, vma);
2414 if (error) {
2415 fput(vma->vm_file);
2416 vma->vm_file = NULL;
2417
2418 vma_iter_set(vmi, vma->vm_end);
2419 /* Undo any partial mapping done by a device driver. */
2420 unmap_region(&vmi->mas, vma, map->prev, map->next);
2421
2422 return error;
2423 }
2424
2425 /* Drivers cannot alter the address of the VMA. */
2426 WARN_ON_ONCE(map->addr != vma->vm_start);
2427 /*
2428 * Drivers should not permit writability when previously it was
2429 * disallowed.
2430 */
2431 VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
2432 !(map->vm_flags & VM_MAYWRITE) &&
2433 (vma->vm_flags & VM_MAYWRITE));
2434
2435 map->file = vma->vm_file;
2436 map->vm_flags = vma->vm_flags;
2437
2438 return 0;
2439 }
2440
2441 /*
2442 * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
2443 * possible.
2444 *
2445 * @map: Mapping state.
2446 * @vmap: Output pointer for the new VMA.
2447 *
2448 * Returns: Zero on success, or an error.
2449 */
2450 static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
2451 {
2452 struct vma_iterator *vmi = map->vmi;
2453 int error = 0;
2454 struct vm_area_struct *vma;
2455
2456 /*
2457 * Determine the object being mapped and call the appropriate
2458 * specific mapper. The address has already been validated but
2459 * not unmapped; the overlapping maps, however, have already been removed.
2460 */
2461 vma = vm_area_alloc(map->mm);
2462 if (!vma)
2463 return -ENOMEM;
2464
2465 vma_iter_config(vmi, map->addr, map->end);
2466 vma_set_range(vma, map->addr, map->end, map->pgoff);
2467 vm_flags_init(vma, map->vm_flags);
2468 vma->vm_page_prot = map->page_prot;
2469
2470 if (vma_iter_prealloc(vmi, vma)) {
2471 error = -ENOMEM;
2472 goto free_vma;
2473 }
2474
2475 if (map->file)
2476 error = __mmap_new_file_vma(map, vma);
2477 else if (map->vm_flags & VM_SHARED)
2478 error = shmem_zero_setup(vma);
2479 else
2480 vma_set_anonymous(vma);
2481
2482 if (error)
2483 goto free_iter_vma;
2484
2485 if (!map->check_ksm_early) {
2486 update_ksm_flags(map);
2487 vm_flags_init(vma, map->vm_flags);
2488 }
2489
2490 #ifdef CONFIG_SPARC64
2491 /* TODO: Fix SPARC ADI! */
2492 WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
2493 #endif
2494
2495 /* Lock the VMA since it is modified after insertion into VMA tree */
2496 vma_start_write(vma);
2497 vma_iter_store_new(vmi, vma);
2498 map->mm->map_count++;
2499 vma_link_file(vma);
2500
2501 /*
2502 * vma_merge_new_range() calls khugepaged_enter_vma() too; the call
2503 * below covers the non-merge case.
2504 */
2505 if (!vma_is_anonymous(vma))
2506 khugepaged_enter_vma(vma, map->vm_flags);
2507 *vmap = vma;
2508 return 0;
2509
2510 free_iter_vma:
2511 vma_iter_free(vmi);
2512 free_vma:
2513 vm_area_free(vma);
2514 return error;
2515 }
2516
2517 /*
2518 * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
2519 * statistics, handle locking and finalise the VMA.
2520 *
2521 * @map: Mapping state.
2522 * @vma: Merged or newly allocated VMA for the mmap()'d region.
2523 */
2524 static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
2525 {
2526 struct mm_struct *mm = map->mm;
2527 vm_flags_t vm_flags = vma->vm_flags;
2528
2529 perf_event_mmap(vma);
2530
2531 /* Unmap any existing mapping in the area. */
2532 vms_complete_munmap_vmas(&map->vms, &map->mas_detach);
2533
2534 vm_stat_account(mm, vma->vm_flags, map->pglen);
2535 if (vm_flags & VM_LOCKED) {
2536 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
2537 is_vm_hugetlb_page(vma) ||
2538 vma == get_gate_vma(mm))
2539 vm_flags_clear(vma, VM_LOCKED_MASK);
2540 else
2541 mm->locked_vm += map->pglen;
2542 }
2543
2544 if (vma->vm_file)
2545 uprobe_mmap(vma);
2546
2547 /*
2548 * A new (or expanded) vma always gets soft-dirty status.
2549 * Otherwise the user-space soft-dirty page tracker won't
2550 * be able to distinguish the situation where a vma area is
2551 * unmapped and then a new one is mapped in place (which must
2552 * be treated as a completely new data area).
2553 */
2554 vm_flags_set(vma, VM_SOFTDIRTY);
2555
2556 vma_set_page_prot(vma);
2557 }
2558
2559 /*
2560 * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
2561 * specifies it.
2562 *
2563 * This is called prior to any merge attempt, and updates the whitelisted
2564 * fields that the hook is permitted to change.
2565 *
2566 * All but user-defined fields will be pre-populated with original values.
2567 *
2568 * Returns 0 on success, or an error code otherwise.
2569 */
2570 static int call_mmap_prepare(struct mmap_state *map)
2571 {
2572 int err;
2573 struct vm_area_desc desc = {
2574 .mm = map->mm,
2575 .start = map->addr,
2576 .end = map->end,
2577
2578 .pgoff = map->pgoff,
2579 .file = map->file,
2580 .vm_flags = map->vm_flags,
2581 .page_prot = map->page_prot,
2582 };
2583
2584 /* Invoke the hook. */
2585 err = vfs_mmap_prepare(map->file, &desc);
2586 if (err)
2587 return err;
2588
2589 /* Update fields permitted to be changed. */
2590 map->pgoff = desc.pgoff;
2591 map->file = desc.file;
2592 map->vm_flags = desc.vm_flags;
2593 map->page_prot = desc.page_prot;
2594 /* User-defined fields. */
2595 map->vm_ops = desc.vm_ops;
2596 map->vm_private_data = desc.private_data;
2597
2598 return 0;
2599 }
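
/*
 * Illustrative sketch (hypothetical driver, not part of this file): an
 * f_op->mmap_prepare() hook receives the vm_area_desc built above and may
 * adjust the whitelisted fields before any merge is attempted, e.g.
 *
 *	static int mydrv_mmap_prepare(struct vm_area_desc *desc)
 *	{
 *		if (desc->end - desc->start > MYDRV_MAX_SIZE)
 *			return -EINVAL;
 *		desc->vm_flags |= VM_DONTEXPAND;
 *		desc->vm_ops = &mydrv_vm_ops;
 *		desc->private_data = mydrv_state_of(desc->file);
 *		return 0;
 *	}
 *
 * The mydrv_* names are made up for illustration; the fields touched are
 * exactly those copied back by call_mmap_prepare() above.
 */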
2600
2601 static void set_vma_user_defined_fields(struct vm_area_struct *vma,
2602 struct mmap_state *map)
2603 {
2604 if (map->vm_ops)
2605 vma->vm_ops = map->vm_ops;
2606 vma->vm_private_data = map->vm_private_data;
2607 }
2608
2609 /*
2610 * Are we guaranteed no driver can change state such as to preclude KSM merging?
2611 * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
2612 */
2613 static bool can_set_ksm_flags_early(struct mmap_state *map)
2614 {
2615 struct file *file = map->file;
2616
2617 /* Anonymous mappings have no driver which can change them. */
2618 if (!file)
2619 return true;
2620
2621 /*
2622 * If .mmap_prepare() is specified, then the driver will have already
2623 * manipulated state prior to updating KSM flags. So no need to worry
2624 * about mmap callbacks modifying VMA flags after the KSM flag has been
2625 * updated here, which could otherwise affect KSM eligibility.
2626 */
2627 if (file->f_op->mmap_prepare)
2628 return true;
2629
2630 /* shmem is safe. */
2631 if (shmem_file(file))
2632 return true;
2633
2634 /* Any other .mmap callback is not safe. */
2635 return false;
2636 }
2637
2638 static unsigned long __mmap_region(struct file *file, unsigned long addr,
2639 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
2640 struct list_head *uf)
2641 {
2642 struct mm_struct *mm = current->mm;
2643 struct vm_area_struct *vma = NULL;
2644 int error;
2645 bool have_mmap_prepare = file && file->f_op->mmap_prepare;
2646 VMA_ITERATOR(vmi, mm, addr);
2647 MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
2648
2649 map.check_ksm_early = can_set_ksm_flags_early(&map);
2650
2651 error = __mmap_prepare(&map, uf);
2652 if (!error && have_mmap_prepare)
2653 error = call_mmap_prepare(&map);
2654 if (error)
2655 goto abort_munmap;
2656
2657 if (map.check_ksm_early)
2658 update_ksm_flags(&map);
2659
2660 /* Attempt to merge with adjacent VMAs... */
2661 if (map.prev || map.next) {
2662 VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
2663
2664 vma = vma_merge_new_range(&vmg);
2665 }
2666
2667 /* ...but if we can't, allocate a new VMA. */
2668 if (!vma) {
2669 error = __mmap_new_vma(&map, &vma);
2670 if (error)
2671 goto unacct_error;
2672 }
2673
2674 if (have_mmap_prepare)
2675 set_vma_user_defined_fields(vma, &map);
2676
2677 __mmap_complete(&map, vma);
2678
2679 return addr;
2680
2681 /* Accounting was done by __mmap_prepare(). */
2682 unacct_error:
2683 if (map.charged)
2684 vm_unacct_memory(map.charged);
2685 abort_munmap:
2686 vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
2687 return error;
2688 }
2689
2690 /**
2691 * mmap_region() - Actually perform the userland mapping of a VMA into
2692 * current->mm with known, aligned and overflow-checked @addr and @len, and
2693 * correctly determined VMA flags @vm_flags and page offset @pgoff.
2694 *
2695 * This is an internal memory management function, and should not be used
2696 * directly.
2697 *
2698 * The caller must write-lock current->mm->mmap_lock.
2699 *
2700 * @file: If a file-backed mapping, a pointer to the struct file describing the
2701 * file to be mapped, otherwise NULL.
2702 * @addr: The page-aligned address at which to perform the mapping.
2703 * @len: The page-aligned, non-zero, length of the mapping.
2704 * @vm_flags: The VMA flags which should be applied to the mapping.
2705 * @pgoff: If @file is specified, the page offset into the file, if not then
2706 * the virtual page offset in memory of the anonymous mapping.
2707 * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
2708 * events.
2709 *
2710 * Returns: Either an error, or the address at which the requested mapping has
2711 * been performed.
2712 */
2713 unsigned long mmap_region(struct file *file, unsigned long addr,
2714 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
2715 struct list_head *uf)
2716 {
2717 unsigned long ret;
2718 bool writable_file_mapping = false;
2719
2720 mmap_assert_write_locked(current->mm);
2721
2722 /* Check to see if MDWE is applicable. */
2723 if (map_deny_write_exec(vm_flags, vm_flags))
2724 return -EACCES;
2725
2726 /* Allow architectures to sanity-check the vm_flags. */
2727 if (!arch_validate_flags(vm_flags))
2728 return -EINVAL;
2729
2730 /* Map writable and ensure this isn't a sealed memfd. */
2731 if (file && is_shared_maywrite(vm_flags)) {
2732 int error = mapping_map_writable(file->f_mapping);
2733
2734 if (error)
2735 return error;
2736 writable_file_mapping = true;
2737 }
2738
2739 ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
2740
2741 /* Clear our write mapping regardless of error. */
2742 if (writable_file_mapping)
2743 mapping_unmap_writable(file->f_mapping);
2744
2745 validate_mm(current->mm);
2746 return ret;
2747 }
2748
2749 /*
2750 * do_brk_flags() - Increase the brk vma if the flags match.
2751 * @vmi: The vma iterator
2752 * @addr: The start address
2753 * @len: The length of the increase
2754 * @vma: The brk vma to extend, or NULL
2755 * @vm_flags: The VMA flags
2756 *
2757 * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
2758 * do not match then create a new anonymous VMA. Eventually we may be able to
2759 * do some brk-specific accounting here.
2760 */
2761 int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
2762 unsigned long addr, unsigned long len, vm_flags_t vm_flags)
2763 {
2764 struct mm_struct *mm = current->mm;
2765
2766 /*
2767 * Check against address space limits by the changed size
2768 * Note: This happens *after* clearing old mappings in some code paths.
2769 */
2770 vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2771 vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
2772 if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
2773 return -ENOMEM;
2774
2775 if (mm->map_count > sysctl_max_map_count)
2776 return -ENOMEM;
2777
2778 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2779 return -ENOMEM;
2780
2781 /*
2782 * Expand the existing vma if possible; Note that singular lists do not
2783 * occur after forking, so the expand will only happen on new VMAs.
2784 */
2785 if (vma && vma->vm_end == addr) {
2786 VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
2787
2788 vmg.prev = vma;
2789 /* vmi is positioned at prev, which this mode expects. */
2790 vmg.just_expand = true;
2791
2792 if (vma_merge_new_range(&vmg))
2793 goto out;
2794 else if (vmg_nomem(&vmg))
2795 goto unacct_fail;
2796 }
2797
2798 if (vma)
2799 vma_iter_next_range(vmi);
2800 /* create a vma struct for an anonymous mapping */
2801 vma = vm_area_alloc(mm);
2802 if (!vma)
2803 goto unacct_fail;
2804
2805 vma_set_anonymous(vma);
2806 vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
2807 vm_flags_init(vma, vm_flags);
2808 vma->vm_page_prot = vm_get_page_prot(vm_flags);
2809 vma_start_write(vma);
2810 if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
2811 goto mas_store_fail;
2812
2813 mm->map_count++;
2814 validate_mm(mm);
2815 out:
2816 perf_event_mmap(vma);
2817 mm->total_vm += len >> PAGE_SHIFT;
2818 mm->data_vm += len >> PAGE_SHIFT;
2819 if (vm_flags & VM_LOCKED)
2820 mm->locked_vm += (len >> PAGE_SHIFT);
2821 vm_flags_set(vma, VM_SOFTDIRTY);
2822 return 0;
2823
2824 mas_store_fail:
2825 vm_area_free(vma);
2826 unacct_fail:
2827 vm_unacct_memory(len >> PAGE_SHIFT);
2828 return -ENOMEM;
2829 }
2830
2831 /**
2832 * unmapped_area() - Find an area between the low_limit and the high_limit with
2833 * the correct alignment and offset, all from @info. Note: current->mm is used
2834 * for the search.
2835 *
2836 * @info: The unmapped area information including the range [low_limit,
2837 * high_limit), the alignment offset and mask.
2838 *
2839 * Return: A memory address or -ENOMEM.
2840 */
2841 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
2842 {
2843 unsigned long length, gap;
2844 unsigned long low_limit, high_limit;
2845 struct vm_area_struct *tmp;
2846 VMA_ITERATOR(vmi, current->mm, 0);
2847
2848 /* Adjust search length to account for worst case alignment overhead */
2849 length = info->length + info->align_mask + info->start_gap;
2850 if (length < info->length)
2851 return -ENOMEM;
2852
2853 low_limit = info->low_limit;
2854 if (low_limit < mmap_min_addr)
2855 low_limit = mmap_min_addr;
2856 high_limit = info->high_limit;
2857 retry:
2858 if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
2859 return -ENOMEM;
2860
2861 /*
2862 * Adjust for the gap first so it doesn't interfere with the
2863 * later alignment. The first step is the minimum needed to
2864 * fulfill the start gap, the next step is the minimum needed to align
2865 * that. Together, this is the minimum needed to fulfill both.
2866 */
2867 gap = vma_iter_addr(&vmi) + info->start_gap;
2868 gap += (info->align_offset - gap) & info->align_mask;
2869 tmp = vma_next(&vmi);
2870 if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
2871 if (vm_start_gap(tmp) < gap + length - 1) {
2872 low_limit = tmp->vm_end;
2873 vma_iter_reset(&vmi);
2874 goto retry;
2875 }
2876 } else {
2877 tmp = vma_prev(&vmi);
2878 if (tmp && vm_end_gap(tmp) > gap) {
2879 low_limit = vm_end_gap(tmp);
2880 vma_iter_reset(&vmi);
2881 goto retry;
2882 }
2883 }
2884
2885 return gap;
2886 }
2887
2888 /**
2889 * unmapped_area_topdown() - Find an area between the low_limit and the
2890 * high_limit with the correct alignment and offset at the highest available
2891 * address, all from @info. Note: current->mm is used for the search.
2892 *
2893 * @info: The unmapped area information including the range [low_limit,
2894 * high_limit), the alignment offset and mask.
2895 *
2896 * Return: A memory address or -ENOMEM.
2897 */
2898 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
2899 {
2900 unsigned long length, gap, gap_end;
2901 unsigned long low_limit, high_limit;
2902 struct vm_area_struct *tmp;
2903 VMA_ITERATOR(vmi, current->mm, 0);
2904
2905 /* Adjust search length to account for worst case alignment overhead */
2906 length = info->length + info->align_mask + info->start_gap;
2907 if (length < info->length)
2908 return -ENOMEM;
2909
2910 low_limit = info->low_limit;
2911 if (low_limit < mmap_min_addr)
2912 low_limit = mmap_min_addr;
2913 high_limit = info->high_limit;
2914 retry:
2915 if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
2916 return -ENOMEM;
2917
2918 gap = vma_iter_end(&vmi) - info->length;
2919 gap -= (gap - info->align_offset) & info->align_mask;
2920 gap_end = vma_iter_end(&vmi);
2921 tmp = vma_next(&vmi);
2922 if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
2923 if (vm_start_gap(tmp) < gap_end) {
2924 high_limit = vm_start_gap(tmp);
2925 vma_iter_reset(&vmi);
2926 goto retry;
2927 }
2928 } else {
2929 tmp = vma_prev(&vmi);
2930 if (tmp && vm_end_gap(tmp) > gap) {
2931 high_limit = tmp->vm_start;
2932 vma_iter_reset(&vmi);
2933 goto retry;
2934 }
2935 }
2936
2937 return gap;
2938 }
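
/*
 * Illustrative only (not part of this file): a get_unmapped_area()
 * implementation typically fills in struct vm_unmapped_area_info and lets the
 * flags select between the bottom-up and top-down searches above, roughly:
 *
 *	struct vm_unmapped_area_info info = {};
 *
 *	info.length = len;
 *	info.low_limit = mm->mmap_base;
 *	info.high_limit = TASK_SIZE;
 *	return vm_unmapped_area(&info);
 *
 * vm_unmapped_area() dispatches to unmapped_area() or unmapped_area_topdown()
 * depending on whether VM_UNMAPPED_AREA_TOPDOWN is set in info.flags.
 */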
2939
2940 /*
2941 * Verify that the stack growth is acceptable and
2942 * update accounting. This is shared with both the
2943 * grow-up and grow-down cases.
2944 */
2945 static int acct_stack_growth(struct vm_area_struct *vma,
2946 unsigned long size, unsigned long grow)
2947 {
2948 struct mm_struct *mm = vma->vm_mm;
2949 unsigned long new_start;
2950
2951 /* address space limit tests */
2952 if (!may_expand_vm(mm, vma->vm_flags, grow))
2953 return -ENOMEM;
2954
2955 /* Stack limit test */
2956 if (size > rlimit(RLIMIT_STACK))
2957 return -ENOMEM;
2958
2959 /* mlock limit tests */
2960 if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
2961 return -ENOMEM;
2962
2963 /* Check to ensure the stack will not grow into a hugetlb-only region */
2964 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2965 vma->vm_end - size;
2966 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2967 return -EFAULT;
2968
2969 /*
2970 * Overcommit.. This must be the final test, as it will
2971 * update security statistics.
2972 */
2973 if (security_vm_enough_memory_mm(mm, grow))
2974 return -ENOMEM;
2975
2976 return 0;
2977 }
2978
2979 #if defined(CONFIG_STACK_GROWSUP)
2980 /*
2981 * PA-RISC uses this for its stack.
2982 * vma is the last one with address > vma->vm_end. Have to extend vma.
2983 */
2984 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2985 {
2986 struct mm_struct *mm = vma->vm_mm;
2987 struct vm_area_struct *next;
2988 unsigned long gap_addr;
2989 int error = 0;
2990 VMA_ITERATOR(vmi, mm, vma->vm_start);
2991
2992 if (!(vma->vm_flags & VM_GROWSUP))
2993 return -EFAULT;
2994
2995 mmap_assert_write_locked(mm);
2996
2997 /* Guard against exceeding limits of the address space. */
2998 address &= PAGE_MASK;
2999 if (address >= (TASK_SIZE & PAGE_MASK))
3000 return -ENOMEM;
3001 address += PAGE_SIZE;
3002
3003 /* Enforce stack_guard_gap */
3004 gap_addr = address + stack_guard_gap;
3005
3006 /* Guard against overflow */
3007 if (gap_addr < address || gap_addr > TASK_SIZE)
3008 gap_addr = TASK_SIZE;
3009
3010 next = find_vma_intersection(mm, vma->vm_end, gap_addr);
3011 if (next && vma_is_accessible(next)) {
3012 if (!(next->vm_flags & VM_GROWSUP))
3013 return -ENOMEM;
3014 /* Check that both stack segments have the same anon_vma? */
3015 }
3016
3017 if (next)
3018 vma_iter_prev_range_limit(&vmi, address);
3019
3020 vma_iter_config(&vmi, vma->vm_start, address);
3021 if (vma_iter_prealloc(&vmi, vma))
3022 return -ENOMEM;
3023
3024 /* We must make sure the anon_vma is allocated. */
3025 if (unlikely(anon_vma_prepare(vma))) {
3026 vma_iter_free(&vmi);
3027 return -ENOMEM;
3028 }
3029
3030 /* Lock the VMA before expanding to prevent concurrent page faults */
3031 vma_start_write(vma);
3032 /* We update the anon VMA tree. */
3033 anon_vma_lock_write(vma->anon_vma);
3034
3035 /* Somebody else might have raced and expanded it already */
3036 if (address > vma->vm_end) {
3037 unsigned long size, grow;
3038
3039 size = address - vma->vm_start;
3040 grow = (address - vma->vm_end) >> PAGE_SHIFT;
3041
3042 error = -ENOMEM;
3043 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
3044 error = acct_stack_growth(vma, size, grow);
3045 if (!error) {
3046 if (vma->vm_flags & VM_LOCKED)
3047 mm->locked_vm += grow;
3048 vm_stat_account(mm, vma->vm_flags, grow);
3049 anon_vma_interval_tree_pre_update_vma(vma);
3050 vma->vm_end = address;
3051 /* Overwrite old entry in mtree. */
3052 vma_iter_store_overwrite(&vmi, vma);
3053 anon_vma_interval_tree_post_update_vma(vma);
3054
3055 perf_event_mmap(vma);
3056 }
3057 }
3058 }
3059 anon_vma_unlock_write(vma->anon_vma);
3060 vma_iter_free(&vmi);
3061 validate_mm(mm);
3062 return error;
3063 }
3064 #endif /* CONFIG_STACK_GROWSUP */
3065
3066 /*
3067 * vma is the first one with address < vma->vm_start. Have to extend vma.
3068 * mmap_lock held for writing.
3069 */
3070 int expand_downwards(struct vm_area_struct *vma, unsigned long address)
3071 {
3072 struct mm_struct *mm = vma->vm_mm;
3073 struct vm_area_struct *prev;
3074 int error = 0;
3075 VMA_ITERATOR(vmi, mm, vma->vm_start);
3076
3077 if (!(vma->vm_flags & VM_GROWSDOWN))
3078 return -EFAULT;
3079
3080 mmap_assert_write_locked(mm);
3081
3082 address &= PAGE_MASK;
3083 if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
3084 return -EPERM;
3085
3086 /* Enforce stack_guard_gap */
3087 prev = vma_prev(&vmi);
3088 /* Check that both stack segments have the same anon_vma? */
3089 if (prev) {
3090 if (!(prev->vm_flags & VM_GROWSDOWN) &&
3091 vma_is_accessible(prev) &&
3092 (address - prev->vm_end < stack_guard_gap))
3093 return -ENOMEM;
3094 }
3095
3096 if (prev)
3097 vma_iter_next_range_limit(&vmi, vma->vm_start);
3098
3099 vma_iter_config(&vmi, address, vma->vm_end);
3100 if (vma_iter_prealloc(&vmi, vma))
3101 return -ENOMEM;
3102
3103 /* We must make sure the anon_vma is allocated. */
3104 if (unlikely(anon_vma_prepare(vma))) {
3105 vma_iter_free(&vmi);
3106 return -ENOMEM;
3107 }
3108
3109 /* Lock the VMA before expanding to prevent concurrent page faults */
3110 vma_start_write(vma);
3111 /* We update the anon VMA tree. */
3112 anon_vma_lock_write(vma->anon_vma);
3113
3114 /* Somebody else might have raced and expanded it already */
3115 if (address < vma->vm_start) {
3116 unsigned long size, grow;
3117
3118 size = vma->vm_end - address;
3119 grow = (vma->vm_start - address) >> PAGE_SHIFT;
3120
3121 error = -ENOMEM;
3122 if (grow <= vma->vm_pgoff) {
3123 error = acct_stack_growth(vma, size, grow);
3124 if (!error) {
3125 if (vma->vm_flags & VM_LOCKED)
3126 mm->locked_vm += grow;
3127 vm_stat_account(mm, vma->vm_flags, grow);
3128 anon_vma_interval_tree_pre_update_vma(vma);
3129 vma->vm_start = address;
3130 vma->vm_pgoff -= grow;
3131 /* Overwrite old entry in mtree. */
3132 vma_iter_store_overwrite(&vmi, vma);
3133 anon_vma_interval_tree_post_update_vma(vma);
3134
3135 perf_event_mmap(vma);
3136 }
3137 }
3138 }
3139 anon_vma_unlock_write(vma->anon_vma);
3140 vma_iter_free(&vmi);
3141 validate_mm(mm);
3142 return error;
3143 }
3144
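/*
 * Common implementation of munmap(): take the mmap write lock, unmap
 * [start, start + len) and deliver any pending userfaultfd unmap events.
 * If @unlock is true and the unmap succeeds, do_vmi_munmap() has already
 * dropped the lock on our behalf.
 */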
3145 int __vm_munmap(unsigned long start, size_t len, bool unlock)
3146 {
3147 int ret;
3148 struct mm_struct *mm = current->mm;
3149 LIST_HEAD(uf);
3150 VMA_ITERATOR(vmi, mm, start);
3151
3152 if (mmap_write_lock_killable(mm))
3153 return -EINTR;
3154
3155 ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
3156 if (ret || !unlock)
3157 mmap_write_unlock(mm);
3158
3159 userfaultfd_unmap_complete(mm, &uf);
3160 return ret;
3161 }
3162
3163 /* Insert vm structure into the process's VMA tree, sorted by address,
3164 * and into the inode's i_mmap tree. If vm_file is non-NULL
3165 * then i_mmap_rwsem is taken here.
3166 */
3167 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
3168 {
3169 unsigned long charged = vma_pages(vma);
3170
3172 if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
3173 return -ENOMEM;
3174
3175 if ((vma->vm_flags & VM_ACCOUNT) &&
3176 security_vm_enough_memory_mm(mm, charged))
3177 return -ENOMEM;
3178
3179 /*
3180 * The vm_pgoff of a purely anonymous vma should be irrelevant
3181 * until its first write fault, when page's anon_vma and index
3182 * are set. But now set the vm_pgoff it will almost certainly
3183 * end up with (unless mremap moves it elsewhere before that
3184 * first write fault), so /proc/pid/maps tells a consistent story.
3185 *
3186 * By setting it to reflect the virtual start address of the
3187 * vma, merges and splits can happen in a seamless way, just
3188 * using the existing file pgoff checks and manipulations.
3189 * Similarly in do_mmap and in do_brk_flags.
3190 */
3191 if (vma_is_anonymous(vma)) {
3192 BUG_ON(vma->anon_vma);
3193 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3194 }
3195
3196 if (vma_link(mm, vma)) {
3197 if (vma->vm_flags & VM_ACCOUNT)
3198 vm_unacct_memory(charged);
3199 return -ENOMEM;
3200 }
3201
3202 return 0;
3203 }
3204