// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * VMA-specific functions.
 */

#include "vma_internal.h"
#include "vma.h"

struct mmap_state {
	struct mm_struct *mm;
	struct vma_iterator *vmi;

	unsigned long addr;
	unsigned long end;
	pgoff_t pgoff;
	unsigned long pglen;
	unsigned long flags;
	struct file *file;
	pgprot_t page_prot;

	/* User-defined fields, perhaps updated by .mmap_prepare(). */
	const struct vm_operations_struct *vm_ops;
	void *vm_private_data;

	unsigned long charged;

	struct vm_area_struct *prev;
	struct vm_area_struct *next;

	/* Unmapping state. */
	struct vma_munmap_struct vms;
	struct ma_state mas_detach;
	struct maple_tree mt_detach;
};

#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_)	\
	struct mmap_state name = {					\
		.mm = mm_,						\
		.vmi = vmi_,						\
		.addr = addr_,						\
		.end = (addr_) + (len_),				\
		.pgoff = pgoff_,					\
		.pglen = PHYS_PFN(len_),				\
		.flags = flags_,					\
		.file = file_,						\
		.page_prot = vm_get_page_prot(flags_),			\
	}

#define VMG_MMAP_STATE(name, map_, vma_)				\
	struct vma_merge_struct name = {				\
		.mm = (map_)->mm,					\
		.vmi = (map_)->vmi,					\
		.start = (map_)->addr,					\
		.end = (map_)->end,					\
		.flags = (map_)->flags,					\
		.pgoff = (map_)->pgoff,					\
		.file = (map_)->file,					\
		.prev = (map_)->prev,					\
		.middle = vma_,						\
		.next = (vma_) ? NULL : (map_)->next,			\
		.state = VMA_MERGE_START,				\
	}

/*
 * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
 * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
 * would mean a wider range of folios sharing the root anon_vma lock, and thus
 * potential lock contention; we do not wish to encourage merging such that this
 * scales into a problem.
 */
static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
{
	/*
	 * The list_is_singular() test is to avoid merging VMAs cloned from
	 * parents, which would otherwise hurt scalability by widening
	 * contention on the shared anon_vma lock.
	 */
	return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
}

static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
	struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;

	if (!mpol_equal(vmg->policy, vma_policy(vma)))
		return false;
	/*
	 * VM_SOFTDIRTY should not prevent VMA merging if the flags match in
	 * everything but the dirty bit -- the caller should mark the merged
	 * VMA as dirty. If the dirty bit were not excluded from the
	 * comparison, we would increase pressure on the memory system by
	 * forcing the kernel to generate new VMAs where an old one could have
	 * been extended instead.
	 */
	if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
		return false;
	if (vma->vm_file != vmg->file)
		return false;
	if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
		return false;
	if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
		return false;
	return true;
}

static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
	struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
	struct vm_area_struct *src = vmg->middle; /* existing merge case. */
	struct anon_vma *tgt_anon = tgt->anon_vma;
	struct anon_vma *src_anon = vmg->anon_vma;

	/*
	 * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
	 * will remove the existing VMA's anon_vmas so there are no scalability
	 * concerns.
117 */ 118 VM_WARN_ON(src && src_anon != src->anon_vma); 119 120 /* Case 1 - we will dup_anon_vma() from src into tgt. */ 121 if (!tgt_anon && src_anon) 122 return !vma_had_uncowed_parents(src); 123 /* Case 2 - we will simply use tgt's anon_vma. */ 124 if (tgt_anon && !src_anon) 125 return !vma_had_uncowed_parents(tgt); 126 /* Case 3 - the anon_vma's are already shared. */ 127 return src_anon == tgt_anon; 128 } 129 130 /* 131 * init_multi_vma_prep() - Initializer for struct vma_prepare 132 * @vp: The vma_prepare struct 133 * @vma: The vma that will be altered once locked 134 * @vmg: The merge state that will be used to determine adjustment and VMA 135 * removal. 136 */ 137 static void init_multi_vma_prep(struct vma_prepare *vp, 138 struct vm_area_struct *vma, 139 struct vma_merge_struct *vmg) 140 { 141 struct vm_area_struct *adjust; 142 struct vm_area_struct **remove = &vp->remove; 143 144 memset(vp, 0, sizeof(struct vma_prepare)); 145 vp->vma = vma; 146 vp->anon_vma = vma->anon_vma; 147 148 if (vmg && vmg->__remove_middle) { 149 *remove = vmg->middle; 150 remove = &vp->remove2; 151 } 152 if (vmg && vmg->__remove_next) 153 *remove = vmg->next; 154 155 if (vmg && vmg->__adjust_middle_start) 156 adjust = vmg->middle; 157 else if (vmg && vmg->__adjust_next_start) 158 adjust = vmg->next; 159 else 160 adjust = NULL; 161 162 vp->adj_next = adjust; 163 if (!vp->anon_vma && adjust) 164 vp->anon_vma = adjust->anon_vma; 165 166 VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && 167 vp->anon_vma != adjust->anon_vma); 168 169 vp->file = vma->vm_file; 170 if (vp->file) 171 vp->mapping = vma->vm_file->f_mapping; 172 173 if (vmg && vmg->skip_vma_uprobe) 174 vp->skip_vma_uprobe = true; 175 } 176 177 /* 178 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 179 * in front of (at a lower virtual address and file offset than) the vma. 180 * 181 * We cannot merge two vmas if they have differently assigned (non-NULL) 182 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 183 * 184 * We don't check here for the merged mmap wrapping around the end of pagecache 185 * indices (16TB on ia32) because do_mmap() does not permit mmap's which 186 * wrap, nor mmaps which cover the final page at index -1UL. 187 * 188 * We assume the vma may be removed as part of the merge. 189 */ 190 static bool can_vma_merge_before(struct vma_merge_struct *vmg) 191 { 192 pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); 193 194 if (is_mergeable_vma(vmg, /* merge_next = */ true) && 195 is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { 196 if (vmg->next->vm_pgoff == vmg->pgoff + pglen) 197 return true; 198 } 199 200 return false; 201 } 202 203 /* 204 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 205 * beyond (at a higher virtual address and file offset than) the vma. 206 * 207 * We cannot merge two vmas if they have differently assigned (non-NULL) 208 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 209 * 210 * We assume that vma is not removed as part of the merge. 
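 *
 * Concretely, the check below is pure arithmetic: in addition to the virtual
 * adjacency the caller has already established, it requires
 * prev->vm_pgoff + vma_pages(prev) == vmg->pgoff, i.e. the proposed range must
 * also continue prev's file mapping contiguously.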
211 */ 212 static bool can_vma_merge_after(struct vma_merge_struct *vmg) 213 { 214 if (is_mergeable_vma(vmg, /* merge_next = */ false) && 215 is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { 216 if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) 217 return true; 218 } 219 return false; 220 } 221 222 static void __vma_link_file(struct vm_area_struct *vma, 223 struct address_space *mapping) 224 { 225 if (vma_is_shared_maywrite(vma)) 226 mapping_allow_writable(mapping); 227 228 flush_dcache_mmap_lock(mapping); 229 vma_interval_tree_insert(vma, &mapping->i_mmap); 230 flush_dcache_mmap_unlock(mapping); 231 } 232 233 /* 234 * Requires inode->i_mapping->i_mmap_rwsem 235 */ 236 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 237 struct address_space *mapping) 238 { 239 if (vma_is_shared_maywrite(vma)) 240 mapping_unmap_writable(mapping); 241 242 flush_dcache_mmap_lock(mapping); 243 vma_interval_tree_remove(vma, &mapping->i_mmap); 244 flush_dcache_mmap_unlock(mapping); 245 } 246 247 /* 248 * vma has some anon_vma assigned, and is already inserted on that 249 * anon_vma's interval trees. 250 * 251 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 252 * vma must be removed from the anon_vma's interval trees using 253 * anon_vma_interval_tree_pre_update_vma(). 254 * 255 * After the update, the vma will be reinserted using 256 * anon_vma_interval_tree_post_update_vma(). 257 * 258 * The entire update must be protected by exclusive mmap_lock and by 259 * the root anon_vma's mutex. 260 */ 261 static void 262 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 263 { 264 struct anon_vma_chain *avc; 265 266 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 267 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 268 } 269 270 static void 271 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 272 { 273 struct anon_vma_chain *avc; 274 275 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 276 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 277 } 278 279 /* 280 * vma_prepare() - Helper function for handling locking VMAs prior to altering 281 * @vp: The initialized vma_prepare struct 282 */ 283 static void vma_prepare(struct vma_prepare *vp) 284 { 285 if (vp->file) { 286 uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); 287 288 if (vp->adj_next) 289 uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, 290 vp->adj_next->vm_end); 291 292 i_mmap_lock_write(vp->mapping); 293 if (vp->insert && vp->insert->vm_file) { 294 /* 295 * Put into interval tree now, so instantiated pages 296 * are visible to arm/parisc __flush_dcache_page 297 * throughout; but we cannot insert into address 298 * space until vma start or end is updated. 299 */ 300 __vma_link_file(vp->insert, 301 vp->insert->vm_file->f_mapping); 302 } 303 } 304 305 if (vp->anon_vma) { 306 anon_vma_lock_write(vp->anon_vma); 307 anon_vma_interval_tree_pre_update_vma(vp->vma); 308 if (vp->adj_next) 309 anon_vma_interval_tree_pre_update_vma(vp->adj_next); 310 } 311 312 if (vp->file) { 313 flush_dcache_mmap_lock(vp->mapping); 314 vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); 315 if (vp->adj_next) 316 vma_interval_tree_remove(vp->adj_next, 317 &vp->mapping->i_mmap); 318 } 319 320 } 321 322 /* 323 * vma_complete- Helper function for handling the unlocking after altering VMAs, 324 * or for inserting a VMA. 
325 * 326 * @vp: The vma_prepare struct 327 * @vmi: The vma iterator 328 * @mm: The mm_struct 329 */ 330 static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, 331 struct mm_struct *mm) 332 { 333 if (vp->file) { 334 if (vp->adj_next) 335 vma_interval_tree_insert(vp->adj_next, 336 &vp->mapping->i_mmap); 337 vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); 338 flush_dcache_mmap_unlock(vp->mapping); 339 } 340 341 if (vp->remove && vp->file) { 342 __remove_shared_vm_struct(vp->remove, vp->mapping); 343 if (vp->remove2) 344 __remove_shared_vm_struct(vp->remove2, vp->mapping); 345 } else if (vp->insert) { 346 /* 347 * split_vma has split insert from vma, and needs 348 * us to insert it before dropping the locks 349 * (it may either follow vma or precede it). 350 */ 351 vma_iter_store_new(vmi, vp->insert); 352 mm->map_count++; 353 } 354 355 if (vp->anon_vma) { 356 anon_vma_interval_tree_post_update_vma(vp->vma); 357 if (vp->adj_next) 358 anon_vma_interval_tree_post_update_vma(vp->adj_next); 359 anon_vma_unlock_write(vp->anon_vma); 360 } 361 362 if (vp->file) { 363 i_mmap_unlock_write(vp->mapping); 364 365 if (!vp->skip_vma_uprobe) { 366 uprobe_mmap(vp->vma); 367 368 if (vp->adj_next) 369 uprobe_mmap(vp->adj_next); 370 } 371 } 372 373 if (vp->remove) { 374 again: 375 vma_mark_detached(vp->remove); 376 if (vp->file) { 377 uprobe_munmap(vp->remove, vp->remove->vm_start, 378 vp->remove->vm_end); 379 fput(vp->file); 380 } 381 if (vp->remove->anon_vma) 382 anon_vma_merge(vp->vma, vp->remove); 383 mm->map_count--; 384 mpol_put(vma_policy(vp->remove)); 385 if (!vp->remove2) 386 WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); 387 vm_area_free(vp->remove); 388 389 /* 390 * In mprotect's case 6 (see comments on vma_merge), 391 * we are removing both mid and next vmas 392 */ 393 if (vp->remove2) { 394 vp->remove = vp->remove2; 395 vp->remove2 = NULL; 396 goto again; 397 } 398 } 399 if (vp->insert && vp->file) 400 uprobe_mmap(vp->insert); 401 } 402 403 /* 404 * init_vma_prep() - Initializer wrapper for vma_prepare struct 405 * @vp: The vma_prepare struct 406 * @vma: The vma that will be altered once locked 407 */ 408 static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) 409 { 410 init_multi_vma_prep(vp, vma, NULL); 411 } 412 413 /* 414 * Can the proposed VMA be merged with the left (previous) VMA taking into 415 * account the start position of the proposed range. 416 */ 417 static bool can_vma_merge_left(struct vma_merge_struct *vmg) 418 419 { 420 return vmg->prev && vmg->prev->vm_end == vmg->start && 421 can_vma_merge_after(vmg); 422 } 423 424 /* 425 * Can the proposed VMA be merged with the right (next) VMA taking into 426 * account the end position of the proposed range. 427 * 428 * In addition, if we can merge with the left VMA, ensure that left and right 429 * anon_vma's are also compatible. 430 */ 431 static bool can_vma_merge_right(struct vma_merge_struct *vmg, 432 bool can_merge_left) 433 { 434 struct vm_area_struct *next = vmg->next; 435 struct vm_area_struct *prev; 436 437 if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) 438 return false; 439 440 if (!can_merge_left) 441 return true; 442 443 /* 444 * If we can merge with prev (left) and next (right), indicating that 445 * each VMA's anon_vma is compatible with the proposed anon_vma, this 446 * does not mean prev and next are compatible with EACH OTHER. 447 * 448 * We therefore check this in addition to mergeability to either side. 
449 */ 450 prev = vmg->prev; 451 return !prev->anon_vma || !next->anon_vma || 452 prev->anon_vma == next->anon_vma; 453 } 454 455 /* 456 * Close a vm structure and free it. 457 */ 458 void remove_vma(struct vm_area_struct *vma) 459 { 460 might_sleep(); 461 vma_close(vma); 462 if (vma->vm_file) 463 fput(vma->vm_file); 464 mpol_put(vma_policy(vma)); 465 vm_area_free(vma); 466 } 467 468 /* 469 * Get rid of page table information in the indicated region. 470 * 471 * Called with the mm semaphore held. 472 */ 473 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, 474 struct vm_area_struct *prev, struct vm_area_struct *next) 475 { 476 struct mm_struct *mm = vma->vm_mm; 477 struct mmu_gather tlb; 478 479 tlb_gather_mmu(&tlb, mm); 480 update_hiwater_rss(mm); 481 unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, 482 /* mm_wr_locked = */ true); 483 mas_set(mas, vma->vm_end); 484 free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 485 next ? next->vm_start : USER_PGTABLES_CEILING, 486 /* mm_wr_locked = */ true); 487 tlb_finish_mmu(&tlb); 488 } 489 490 /* 491 * __split_vma() bypasses sysctl_max_map_count checking. We use this where it 492 * has already been checked or doesn't make sense to fail. 493 * VMA Iterator will point to the original VMA. 494 */ 495 static __must_check int 496 __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 497 unsigned long addr, int new_below) 498 { 499 struct vma_prepare vp; 500 struct vm_area_struct *new; 501 int err; 502 503 WARN_ON(vma->vm_start >= addr); 504 WARN_ON(vma->vm_end <= addr); 505 506 if (vma->vm_ops && vma->vm_ops->may_split) { 507 err = vma->vm_ops->may_split(vma, addr); 508 if (err) 509 return err; 510 } 511 512 new = vm_area_dup(vma); 513 if (!new) 514 return -ENOMEM; 515 516 if (new_below) { 517 new->vm_end = addr; 518 } else { 519 new->vm_start = addr; 520 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 521 } 522 523 err = -ENOMEM; 524 vma_iter_config(vmi, new->vm_start, new->vm_end); 525 if (vma_iter_prealloc(vmi, new)) 526 goto out_free_vma; 527 528 err = vma_dup_policy(vma, new); 529 if (err) 530 goto out_free_vmi; 531 532 err = anon_vma_clone(new, vma); 533 if (err) 534 goto out_free_mpol; 535 536 if (new->vm_file) 537 get_file(new->vm_file); 538 539 if (new->vm_ops && new->vm_ops->open) 540 new->vm_ops->open(new); 541 542 vma_start_write(vma); 543 vma_start_write(new); 544 545 init_vma_prep(&vp, vma); 546 vp.insert = new; 547 vma_prepare(&vp); 548 549 /* 550 * Get rid of huge pages and shared page tables straddling the split 551 * boundary. 552 */ 553 vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); 554 if (is_vm_hugetlb_page(vma)) 555 hugetlb_split(vma, addr); 556 557 if (new_below) { 558 vma->vm_start = addr; 559 vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; 560 } else { 561 vma->vm_end = addr; 562 } 563 564 /* vma_complete stores the new vma */ 565 vma_complete(&vp, vmi, vma->vm_mm); 566 validate_mm(vma->vm_mm); 567 568 /* Success. */ 569 if (new_below) 570 vma_next(vmi); 571 else 572 vma_prev(vmi); 573 574 return 0; 575 576 out_free_mpol: 577 mpol_put(vma_policy(new)); 578 out_free_vmi: 579 vma_iter_free(vmi); 580 out_free_vma: 581 vm_area_free(new); 582 return err; 583 } 584 585 /* 586 * Split a vma into two pieces at address 'addr', a new vma is allocated 587 * either for the first part or the tail. 
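 *
 * For example, splitting a VMA spanning [0x1000, 0x4000) at addr == 0x2000
 * with new_below == 1 allocates the new VMA for [0x1000, 0x2000) and trims the
 * original to [0x2000, 0x4000); with new_below == 0 the new VMA instead covers
 * [0x2000, 0x4000) and the original keeps [0x1000, 0x2000).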
588 */ 589 static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 590 unsigned long addr, int new_below) 591 { 592 if (vma->vm_mm->map_count >= sysctl_max_map_count) 593 return -ENOMEM; 594 595 return __split_vma(vmi, vma, addr, new_below); 596 } 597 598 /* 599 * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the 600 * instance that the destination VMA has no anon_vma but the source does. 601 * 602 * @dst: The destination VMA 603 * @src: The source VMA 604 * @dup: Pointer to the destination VMA when successful. 605 * 606 * Returns: 0 on success. 607 */ 608 static int dup_anon_vma(struct vm_area_struct *dst, 609 struct vm_area_struct *src, struct vm_area_struct **dup) 610 { 611 /* 612 * There are three cases to consider for correctly propagating 613 * anon_vma's on merge. 614 * 615 * The first is trivial - neither VMA has anon_vma, we need not do 616 * anything. 617 * 618 * The second where both have anon_vma is also a no-op, as they must 619 * then be the same, so there is simply nothing to copy. 620 * 621 * Here we cover the third - if the destination VMA has no anon_vma, 622 * that is it is unfaulted, we need to ensure that the newly merged 623 * range is referenced by the anon_vma's of the source. 624 */ 625 if (src->anon_vma && !dst->anon_vma) { 626 int ret; 627 628 vma_assert_write_locked(dst); 629 dst->anon_vma = src->anon_vma; 630 ret = anon_vma_clone(dst, src); 631 if (ret) 632 return ret; 633 634 *dup = dst; 635 } 636 637 return 0; 638 } 639 640 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 641 void validate_mm(struct mm_struct *mm) 642 { 643 int bug = 0; 644 int i = 0; 645 struct vm_area_struct *vma; 646 VMA_ITERATOR(vmi, mm, 0); 647 648 mt_validate(&mm->mm_mt); 649 for_each_vma(vmi, vma) { 650 #ifdef CONFIG_DEBUG_VM_RB 651 struct anon_vma *anon_vma = vma->anon_vma; 652 struct anon_vma_chain *avc; 653 #endif 654 unsigned long vmi_start, vmi_end; 655 bool warn = 0; 656 657 vmi_start = vma_iter_addr(&vmi); 658 vmi_end = vma_iter_end(&vmi); 659 if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) 660 warn = 1; 661 662 if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) 663 warn = 1; 664 665 if (warn) { 666 pr_emerg("issue in %s\n", current->comm); 667 dump_stack(); 668 dump_vma(vma); 669 pr_emerg("tree range: %px start %lx end %lx\n", vma, 670 vmi_start, vmi_end - 1); 671 vma_iter_dump_tree(&vmi); 672 } 673 674 #ifdef CONFIG_DEBUG_VM_RB 675 if (anon_vma) { 676 anon_vma_lock_read(anon_vma); 677 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 678 anon_vma_interval_tree_verify(avc); 679 anon_vma_unlock_read(anon_vma); 680 } 681 #endif 682 /* Check for a infinite loop */ 683 if (++i > mm->map_count + 10) { 684 i = -1; 685 break; 686 } 687 } 688 if (i != mm->map_count) { 689 pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); 690 bug = 1; 691 } 692 VM_BUG_ON_MM(bug, mm); 693 } 694 #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ 695 696 /* 697 * Based on the vmg flag indicating whether we need to adjust the vm_start field 698 * for the middle or next VMA, we calculate what the range of the newly adjusted 699 * VMA ought to be, and set the VMA's range accordingly. 
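 *
 * For instance, when middle is shrunk from the left (__adjust_middle_start),
 * its new start becomes vmg->end, so its vm_pgoff must advance by
 * PHYS_PFN(vmg->end - middle->vm_start) to keep the file offset in step with
 * the new virtual start.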
 */
static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *adjust;
	pgoff_t pgoff;

	if (vmg->__adjust_middle_start) {
		adjust = vmg->middle;
		pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
	} else if (vmg->__adjust_next_start) {
		adjust = vmg->next;
		pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
	} else {
		return;
	}

	vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
}

/*
 * Actually perform the VMA merge operation.
 *
 * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
 * modify any VMAs or cause inconsistent state should an OOM condition arise.
 *
 * Returns 0 on success, or an error value on failure.
 */
static int commit_merge(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *vma;
	struct vma_prepare vp;

	if (vmg->__adjust_next_start) {
		/* We manipulate middle and adjust next, which is the target. */
		vma = vmg->middle;
		vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
	} else {
		vma = vmg->target;
		/* Note: vma iterator must be pointing to 'start'. */
		vma_iter_config(vmg->vmi, vmg->start, vmg->end);
	}

	init_multi_vma_prep(&vp, vma, vmg);

	/*
	 * If vmg->give_up_on_oom is set, we're safe, because we don't actually
	 * manipulate any VMAs until we succeed at preallocation.
	 *
	 * Past this point, we will not return an error.
	 */
	if (vma_iter_prealloc(vmg->vmi, vma))
		return -ENOMEM;

	vma_prepare(&vp);
	/*
	 * THP pages may need to do additional splits if we increase
	 * middle->vm_start.
	 */
	vma_adjust_trans_huge(vma, vmg->start, vmg->end,
			      vmg->__adjust_middle_start ? vmg->middle : NULL);
	vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
	vmg_adjust_set_range(vmg);
	vma_iter_store_overwrite(vmg->vmi, vmg->target);

	vma_complete(&vp, vmg->vmi, vma->vm_mm);

	return 0;
}

/* We can only remove VMAs when merging if they do not have a close hook. */
static bool can_merge_remove_vma(struct vm_area_struct *vma)
{
	return !vma->vm_ops || !vma->vm_ops->close;
}

/*
 * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
 * attributes modified.
 *
 * @vmg: Describes the modifications being made to a VMA and associated
 *       metadata.
 *
 * When the attributes of a range within a VMA change, then it might be possible
 * for immediately adjacent VMAs to be merged into that VMA due to having
 * identical properties.
 *
 * This function checks for the existence of any such mergeable VMAs and updates
 * the maple tree describing the @vmg->middle->vm_mm address space to account
 * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
 * merge.
 *
 * As part of this operation, if a merge occurs, the @vmg object will have its
 * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
 * calls to this function should reset these fields.
 *
 * Returns: The merged VMA if merge succeeds, or NULL otherwise.
 *
 * ASSUMPTIONS:
 * - The caller must assign the VMA to be modified to @vmg->middle.
 * - The caller must have set @vmg->prev to the previous VMA, if there is one.
 * - The caller must not set @vmg->next, as we determine this.
 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
 */
static __must_check struct vm_area_struct *vma_merge_existing_range(
		struct vma_merge_struct *vmg)
{
	struct vm_area_struct *middle = vmg->middle;
	struct vm_area_struct *prev = vmg->prev;
	struct vm_area_struct *next;
	struct vm_area_struct *anon_dup = NULL;
	unsigned long start = vmg->start;
	unsigned long end = vmg->end;
	bool left_side = middle && start == middle->vm_start;
	bool right_side = middle && end == middle->vm_end;
	int err = 0;
	bool merge_left, merge_right, merge_both;

	mmap_assert_write_locked(vmg->mm);
	VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
	VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
	VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
	VM_WARN_ON_VMG(start >= end, vmg);

	/*
	 * If middle == prev, then we are offset into a VMA. Otherwise, we must
	 * span a portion of the VMA.
	 */
	VM_WARN_ON_VMG(middle &&
		       ((middle != prev && vmg->start != middle->vm_start) ||
			vmg->end > middle->vm_end), vmg);
	/* The vmi must be positioned within vmg->middle. */
	VM_WARN_ON_VMG(middle &&
		       !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
			 vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);

	vmg->state = VMA_MERGE_NOMERGE;

	/*
	 * If this is a special mapping, or if the range being modified is
	 * neither at the furthermost left nor the furthermost right side of
	 * the VMA, then we have no chance of merging and should abort.
	 */
	if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
		return NULL;

	if (left_side)
		merge_left = can_vma_merge_left(vmg);
	else
		merge_left = false;

	if (right_side) {
		next = vmg->next = vma_iter_next_range(vmg->vmi);
		vma_iter_prev_range(vmg->vmi);

		merge_right = can_vma_merge_right(vmg, merge_left);
	} else {
		merge_right = false;
		next = NULL;
	}

	if (merge_left)		/* If merging prev, position iterator there. */
		vma_prev(vmg->vmi);
	else if (!merge_right)	/* If we have nothing to merge, abort. */
		return NULL;

	merge_both = merge_left && merge_right;
	/* If we span the entire VMA, a merge implies it will be deleted. */
	vmg->__remove_middle = left_side && right_side;

	/*
	 * If we need to remove middle in its entirety but are unable to do so,
	 * we have no sensible recourse but to abort the merge.
	 */
	if (vmg->__remove_middle && !can_merge_remove_vma(middle))
		return NULL;

	/*
	 * If we merge both VMAs, then next is also deleted. This implies that
	 * __remove_middle is set as well.
	 */
	vmg->__remove_next = merge_both;

	/*
	 * If we cannot delete next, then we can reduce the operation to merging
	 * prev and middle (thereby deleting middle).
	 */
	if (vmg->__remove_next && !can_merge_remove_vma(next)) {
		vmg->__remove_next = false;
		merge_right = false;
		merge_both = false;
	}

	/* No matter what happens, we will be adjusting middle.
*/ 894 vma_start_write(middle); 895 896 if (merge_right) { 897 vma_start_write(next); 898 vmg->target = next; 899 } 900 901 if (merge_left) { 902 vma_start_write(prev); 903 vmg->target = prev; 904 } 905 906 if (merge_both) { 907 /* 908 * |<-------------------->| 909 * |-------********-------| 910 * prev middle next 911 * extend delete delete 912 */ 913 914 vmg->start = prev->vm_start; 915 vmg->end = next->vm_end; 916 vmg->pgoff = prev->vm_pgoff; 917 918 /* 919 * We already ensured anon_vma compatibility above, so now it's 920 * simply a case of, if prev has no anon_vma object, which of 921 * next or middle contains the anon_vma we must duplicate. 922 */ 923 err = dup_anon_vma(prev, next->anon_vma ? next : middle, 924 &anon_dup); 925 } else if (merge_left) { 926 /* 927 * |<------------>| OR 928 * |<----------------->| 929 * |-------************* 930 * prev middle 931 * extend shrink/delete 932 */ 933 934 vmg->start = prev->vm_start; 935 vmg->pgoff = prev->vm_pgoff; 936 937 if (!vmg->__remove_middle) 938 vmg->__adjust_middle_start = true; 939 940 err = dup_anon_vma(prev, middle, &anon_dup); 941 } else { /* merge_right */ 942 /* 943 * |<------------->| OR 944 * |<----------------->| 945 * *************-------| 946 * middle next 947 * shrink/delete extend 948 */ 949 950 pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); 951 952 VM_WARN_ON_VMG(!merge_right, vmg); 953 /* If we are offset into a VMA, then prev must be middle. */ 954 VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); 955 956 if (vmg->__remove_middle) { 957 vmg->end = next->vm_end; 958 vmg->pgoff = next->vm_pgoff - pglen; 959 } else { 960 /* We shrink middle and expand next. */ 961 vmg->__adjust_next_start = true; 962 vmg->start = middle->vm_start; 963 vmg->end = start; 964 vmg->pgoff = middle->vm_pgoff; 965 } 966 967 err = dup_anon_vma(next, middle, &anon_dup); 968 } 969 970 if (err) 971 goto abort; 972 973 err = commit_merge(vmg); 974 if (err) { 975 VM_WARN_ON(err != -ENOMEM); 976 977 if (anon_dup) 978 unlink_anon_vmas(anon_dup); 979 980 /* 981 * We've cleaned up any cloned anon_vma's, no VMAs have been 982 * modified, no harm no foul if the user requests that we not 983 * report this and just give up, leaving the VMAs unmerged. 984 */ 985 if (!vmg->give_up_on_oom) 986 vmg->state = VMA_MERGE_ERROR_NOMEM; 987 return NULL; 988 } 989 990 khugepaged_enter_vma(vmg->target, vmg->flags); 991 vmg->state = VMA_MERGE_SUCCESS; 992 return vmg->target; 993 994 abort: 995 vma_iter_set(vmg->vmi, start); 996 vma_iter_load(vmg->vmi); 997 998 /* 999 * This means we have failed to clone anon_vma's correctly, but no 1000 * actual changes to VMAs have occurred, so no harm no foul - if the 1001 * user doesn't want this reported and instead just wants to give up on 1002 * the merge, allow it. 1003 */ 1004 if (!vmg->give_up_on_oom) 1005 vmg->state = VMA_MERGE_ERROR_NOMEM; 1006 return NULL; 1007 } 1008 1009 /* 1010 * vma_merge_new_range - Attempt to merge a new VMA into address space 1011 * 1012 * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end 1013 * (exclusive), which we try to merge with any adjacent VMAs if possible. 1014 * 1015 * We are about to add a VMA to the address space starting at @vmg->start and 1016 * ending at @vmg->end. There are three different possible scenarios: 1017 * 1018 * 1. 
There is a VMA with identical properties immediately adjacent to the 1019 * proposed new VMA [@vmg->start, @vmg->end) either before or after it - 1020 * EXPAND that VMA: 1021 * 1022 * Proposed: |-----| or |-----| 1023 * Existing: |----| |----| 1024 * 1025 * 2. There are VMAs with identical properties immediately adjacent to the 1026 * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - 1027 * EXPAND the former and REMOVE the latter: 1028 * 1029 * Proposed: |-----| 1030 * Existing: |----| |----| 1031 * 1032 * 3. There are no VMAs immediately adjacent to the proposed new VMA or those 1033 * VMAs do not have identical attributes - NO MERGE POSSIBLE. 1034 * 1035 * In instances where we can merge, this function returns the expanded VMA which 1036 * will have its range adjusted accordingly and the underlying maple tree also 1037 * adjusted. 1038 * 1039 * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer 1040 * to the VMA we expanded. 1041 * 1042 * This function adjusts @vmg to provide @vmg->next if not already specified, 1043 * and adjusts [@vmg->start, @vmg->end) to span the expanded range. 1044 * 1045 * ASSUMPTIONS: 1046 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. 1047 * - The caller must have determined that [@vmg->start, @vmg->end) is empty, 1048 other than VMAs that will be unmapped should the operation succeed. 1049 * - The caller must have specified the previous vma in @vmg->prev. 1050 * - The caller must have specified the next vma in @vmg->next. 1051 * - The caller must have positioned the vmi at or before the gap. 1052 */ 1053 struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) 1054 { 1055 struct vm_area_struct *prev = vmg->prev; 1056 struct vm_area_struct *next = vmg->next; 1057 unsigned long end = vmg->end; 1058 bool can_merge_left, can_merge_right; 1059 1060 mmap_assert_write_locked(vmg->mm); 1061 VM_WARN_ON_VMG(vmg->middle, vmg); 1062 /* vmi must point at or before the gap. */ 1063 VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); 1064 1065 vmg->state = VMA_MERGE_NOMERGE; 1066 1067 /* Special VMAs are unmergeable, also if no prev/next. */ 1068 if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) 1069 return NULL; 1070 1071 can_merge_left = can_vma_merge_left(vmg); 1072 can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); 1073 1074 /* If we can merge with the next VMA, adjust vmg accordingly. */ 1075 if (can_merge_right) { 1076 vmg->end = next->vm_end; 1077 vmg->middle = next; 1078 } 1079 1080 /* If we can merge with the previous VMA, adjust vmg accordingly. */ 1081 if (can_merge_left) { 1082 vmg->start = prev->vm_start; 1083 vmg->middle = prev; 1084 vmg->pgoff = prev->vm_pgoff; 1085 1086 /* 1087 * If this merge would result in removal of the next VMA but we 1088 * are not permitted to do so, reduce the operation to merging 1089 * prev and vma. 1090 */ 1091 if (can_merge_right && !can_merge_remove_vma(next)) 1092 vmg->end = end; 1093 1094 /* In expand-only case we are already positioned at prev. */ 1095 if (!vmg->just_expand) { 1096 /* Equivalent to going to the previous range. */ 1097 vma_prev(vmg->vmi); 1098 } 1099 } 1100 1101 /* 1102 * Now try to expand adjacent VMA(s). This takes care of removing the 1103 * following VMA if we have VMAs on both sides. 
	 */
	if (vmg->middle && !vma_expand(vmg)) {
		khugepaged_enter_vma(vmg->middle, vmg->flags);
		vmg->state = VMA_MERGE_SUCCESS;
		return vmg->middle;
	}

	return NULL;
}

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmg: Describes a VMA expansion operation.
 *
 * Expand @vmg->middle to vmg->start and vmg->end. Can expand off the start and
 * end. Will expand over vmg->next if it's different from vmg->middle and
 * vmg->end == vmg->next->vm_end. Checking if vmg->middle can expand and merge
 * with vmg->next needs to be handled by the caller.
 *
 * Returns: 0 on success.
 *
 * ASSUMPTIONS:
 * - The caller must hold a WRITE lock on vmg->middle->vm_mm->mmap_lock.
 * - The caller must have set @vmg->middle and @vmg->next.
 */
int vma_expand(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *anon_dup = NULL;
	bool remove_next = false;
	struct vm_area_struct *middle = vmg->middle;
	struct vm_area_struct *next = vmg->next;

	mmap_assert_write_locked(vmg->mm);

	vma_start_write(middle);
	if (next && (middle != next) && (vmg->end == next->vm_end)) {
		int ret;

		remove_next = true;
		/* This should already have been checked by this point. */
		VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
		vma_start_write(next);
		/*
		 * In this case we don't report OOM, so vmg->give_up_on_oom is
		 * safe.
		 */
		ret = dup_anon_vma(middle, next, &anon_dup);
		if (ret)
			return ret;
	}

	/* Not merging but overwriting any part of next is not handled. */
	VM_WARN_ON_VMG(next && !remove_next &&
		       next != middle && vmg->end > next->vm_start, vmg);
	/* Only handles expanding */
	VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
		       middle->vm_end > vmg->end, vmg);

	vmg->target = middle;
	if (remove_next)
		vmg->__remove_next = true;

	if (commit_merge(vmg))
		goto nomem;

	return 0;

nomem:
	if (anon_dup)
		unlink_anon_vmas(anon_dup);
	/*
	 * If the user requests that we just give up on OOM, we are safe to do
	 * so here, as commit_merge() provides this contract to us. Nothing has
	 * been changed - no harm no foul, just don't report it.
1179 */ 1180 if (!vmg->give_up_on_oom) 1181 vmg->state = VMA_MERGE_ERROR_NOMEM; 1182 return -ENOMEM; 1183 } 1184 1185 /* 1186 * vma_shrink() - Reduce an existing VMAs memory area 1187 * @vmi: The vma iterator 1188 * @vma: The VMA to modify 1189 * @start: The new start 1190 * @end: The new end 1191 * 1192 * Returns: 0 on success, -ENOMEM otherwise 1193 */ 1194 int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 1195 unsigned long start, unsigned long end, pgoff_t pgoff) 1196 { 1197 struct vma_prepare vp; 1198 1199 WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); 1200 1201 if (vma->vm_start < start) 1202 vma_iter_config(vmi, vma->vm_start, start); 1203 else 1204 vma_iter_config(vmi, end, vma->vm_end); 1205 1206 if (vma_iter_prealloc(vmi, NULL)) 1207 return -ENOMEM; 1208 1209 vma_start_write(vma); 1210 1211 init_vma_prep(&vp, vma); 1212 vma_prepare(&vp); 1213 vma_adjust_trans_huge(vma, start, end, NULL); 1214 1215 vma_iter_clear(vmi); 1216 vma_set_range(vma, start, end, pgoff); 1217 vma_complete(&vp, vmi, vma->vm_mm); 1218 validate_mm(vma->vm_mm); 1219 return 0; 1220 } 1221 1222 static inline void vms_clear_ptes(struct vma_munmap_struct *vms, 1223 struct ma_state *mas_detach, bool mm_wr_locked) 1224 { 1225 struct mmu_gather tlb; 1226 1227 if (!vms->clear_ptes) /* Nothing to do */ 1228 return; 1229 1230 /* 1231 * We can free page tables without write-locking mmap_lock because VMAs 1232 * were isolated before we downgraded mmap_lock. 1233 */ 1234 mas_set(mas_detach, 1); 1235 tlb_gather_mmu(&tlb, vms->vma->vm_mm); 1236 update_hiwater_rss(vms->vma->vm_mm); 1237 unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, 1238 vms->vma_count, mm_wr_locked); 1239 1240 mas_set(mas_detach, 1); 1241 /* start and end may be different if there is no prev or next vma. */ 1242 free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, 1243 vms->unmap_end, mm_wr_locked); 1244 tlb_finish_mmu(&tlb); 1245 vms->clear_ptes = false; 1246 } 1247 1248 static void vms_clean_up_area(struct vma_munmap_struct *vms, 1249 struct ma_state *mas_detach) 1250 { 1251 struct vm_area_struct *vma; 1252 1253 if (!vms->nr_pages) 1254 return; 1255 1256 vms_clear_ptes(vms, mas_detach, true); 1257 mas_set(mas_detach, 0); 1258 mas_for_each(mas_detach, vma, ULONG_MAX) 1259 vma_close(vma); 1260 } 1261 1262 /* 1263 * vms_complete_munmap_vmas() - Finish the munmap() operation 1264 * @vms: The vma munmap struct 1265 * @mas_detach: The maple state of the detached vmas 1266 * 1267 * This updates the mm_struct, unmaps the region, frees the resources 1268 * used for the munmap() and may downgrade the lock - if requested. Everything 1269 * needed to be done once the vma maple tree is updated. 
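 *
 * Note: this is the second half of the two-phase munmap. The VMAs must already
 * have been gathered into @mas_detach by vms_gather_munmap_vmas() and removed
 * from the mm's maple tree before this is called; past that point the
 * operation can no longer fail.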
1270 */ 1271 static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, 1272 struct ma_state *mas_detach) 1273 { 1274 struct vm_area_struct *vma; 1275 struct mm_struct *mm; 1276 1277 mm = current->mm; 1278 mm->map_count -= vms->vma_count; 1279 mm->locked_vm -= vms->locked_vm; 1280 if (vms->unlock) 1281 mmap_write_downgrade(mm); 1282 1283 if (!vms->nr_pages) 1284 return; 1285 1286 vms_clear_ptes(vms, mas_detach, !vms->unlock); 1287 /* Update high watermark before we lower total_vm */ 1288 update_hiwater_vm(mm); 1289 /* Stat accounting */ 1290 WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); 1291 /* Paranoid bookkeeping */ 1292 VM_WARN_ON(vms->exec_vm > mm->exec_vm); 1293 VM_WARN_ON(vms->stack_vm > mm->stack_vm); 1294 VM_WARN_ON(vms->data_vm > mm->data_vm); 1295 mm->exec_vm -= vms->exec_vm; 1296 mm->stack_vm -= vms->stack_vm; 1297 mm->data_vm -= vms->data_vm; 1298 1299 /* Remove and clean up vmas */ 1300 mas_set(mas_detach, 0); 1301 mas_for_each(mas_detach, vma, ULONG_MAX) 1302 remove_vma(vma); 1303 1304 vm_unacct_memory(vms->nr_accounted); 1305 validate_mm(mm); 1306 if (vms->unlock) 1307 mmap_read_unlock(mm); 1308 1309 __mt_destroy(mas_detach->tree); 1310 } 1311 1312 /* 1313 * reattach_vmas() - Undo any munmap work and free resources 1314 * @mas_detach: The maple state with the detached maple tree 1315 * 1316 * Reattach any detached vmas and free up the maple tree used to track the vmas. 1317 */ 1318 static void reattach_vmas(struct ma_state *mas_detach) 1319 { 1320 struct vm_area_struct *vma; 1321 1322 mas_set(mas_detach, 0); 1323 mas_for_each(mas_detach, vma, ULONG_MAX) 1324 vma_mark_attached(vma); 1325 1326 __mt_destroy(mas_detach->tree); 1327 } 1328 1329 /* 1330 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree 1331 * for removal at a later date. Handles splitting first and last if necessary 1332 * and marking the vmas as isolated. 1333 * 1334 * @vms: The vma munmap struct 1335 * @mas_detach: The maple state tracking the detached tree 1336 * 1337 * Return: 0 on success, error otherwise 1338 */ 1339 static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, 1340 struct ma_state *mas_detach) 1341 { 1342 struct vm_area_struct *next = NULL; 1343 int error; 1344 1345 /* 1346 * If we need to split any vma, do it now to save pain later. 1347 * Does it split the first one? 1348 */ 1349 if (vms->start > vms->vma->vm_start) { 1350 1351 /* 1352 * Make sure that map_count on return from munmap() will 1353 * not exceed its limit; but let map_count go just above 1354 * its limit temporarily, to help free resources as expected. 1355 */ 1356 if (vms->end < vms->vma->vm_end && 1357 vms->vma->vm_mm->map_count >= sysctl_max_map_count) { 1358 error = -ENOMEM; 1359 goto map_count_exceeded; 1360 } 1361 1362 /* Don't bother splitting the VMA if we can't unmap it anyway */ 1363 if (!can_modify_vma(vms->vma)) { 1364 error = -EPERM; 1365 goto start_split_failed; 1366 } 1367 1368 error = __split_vma(vms->vmi, vms->vma, vms->start, 1); 1369 if (error) 1370 goto start_split_failed; 1371 } 1372 vms->prev = vma_prev(vms->vmi); 1373 if (vms->prev) 1374 vms->unmap_start = vms->prev->vm_end; 1375 1376 /* 1377 * Detach a range of VMAs from the mm. Using next as a temp variable as 1378 * it is always overwritten. 1379 */ 1380 for_each_vma_range(*(vms->vmi), next, vms->end) { 1381 long nrpages; 1382 1383 if (!can_modify_vma(next)) { 1384 error = -EPERM; 1385 goto modify_vma_failed; 1386 } 1387 /* Does it split the end? 
*/ 1388 if (next->vm_end > vms->end) { 1389 error = __split_vma(vms->vmi, next, vms->end, 0); 1390 if (error) 1391 goto end_split_failed; 1392 } 1393 vma_start_write(next); 1394 mas_set(mas_detach, vms->vma_count++); 1395 error = mas_store_gfp(mas_detach, next, GFP_KERNEL); 1396 if (error) 1397 goto munmap_gather_failed; 1398 1399 vma_mark_detached(next); 1400 nrpages = vma_pages(next); 1401 1402 vms->nr_pages += nrpages; 1403 if (next->vm_flags & VM_LOCKED) 1404 vms->locked_vm += nrpages; 1405 1406 if (next->vm_flags & VM_ACCOUNT) 1407 vms->nr_accounted += nrpages; 1408 1409 if (is_exec_mapping(next->vm_flags)) 1410 vms->exec_vm += nrpages; 1411 else if (is_stack_mapping(next->vm_flags)) 1412 vms->stack_vm += nrpages; 1413 else if (is_data_mapping(next->vm_flags)) 1414 vms->data_vm += nrpages; 1415 1416 if (vms->uf) { 1417 /* 1418 * If userfaultfd_unmap_prep returns an error the vmas 1419 * will remain split, but userland will get a 1420 * highly unexpected error anyway. This is no 1421 * different than the case where the first of the two 1422 * __split_vma fails, but we don't undo the first 1423 * split, despite we could. This is unlikely enough 1424 * failure that it's not worth optimizing it for. 1425 */ 1426 error = userfaultfd_unmap_prep(next, vms->start, 1427 vms->end, vms->uf); 1428 if (error) 1429 goto userfaultfd_error; 1430 } 1431 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 1432 BUG_ON(next->vm_start < vms->start); 1433 BUG_ON(next->vm_start > vms->end); 1434 #endif 1435 } 1436 1437 vms->next = vma_next(vms->vmi); 1438 if (vms->next) 1439 vms->unmap_end = vms->next->vm_start; 1440 1441 #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 1442 /* Make sure no VMAs are about to be lost. */ 1443 { 1444 MA_STATE(test, mas_detach->tree, 0, 0); 1445 struct vm_area_struct *vma_mas, *vma_test; 1446 int test_count = 0; 1447 1448 vma_iter_set(vms->vmi, vms->start); 1449 rcu_read_lock(); 1450 vma_test = mas_find(&test, vms->vma_count - 1); 1451 for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { 1452 BUG_ON(vma_mas != vma_test); 1453 test_count++; 1454 vma_test = mas_next(&test, vms->vma_count - 1); 1455 } 1456 rcu_read_unlock(); 1457 BUG_ON(vms->vma_count != test_count); 1458 } 1459 #endif 1460 1461 while (vma_iter_addr(vms->vmi) > vms->start) 1462 vma_iter_prev_range(vms->vmi); 1463 1464 vms->clear_ptes = true; 1465 return 0; 1466 1467 userfaultfd_error: 1468 munmap_gather_failed: 1469 end_split_failed: 1470 modify_vma_failed: 1471 reattach_vmas(mas_detach); 1472 start_split_failed: 1473 map_count_exceeded: 1474 return error; 1475 } 1476 1477 /* 1478 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct 1479 * @vms: The vma munmap struct 1480 * @vmi: The vma iterator 1481 * @vma: The first vm_area_struct to munmap 1482 * @start: The aligned start address to munmap 1483 * @end: The aligned end address to munmap 1484 * @uf: The userfaultfd list_head 1485 * @unlock: Unlock after the operation. 
 *          Only unlocked on success.
 */
static void init_vma_munmap(struct vma_munmap_struct *vms,
		struct vma_iterator *vmi, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, struct list_head *uf,
		bool unlock)
{
	vms->vmi = vmi;
	vms->vma = vma;
	if (vma) {
		vms->start = start;
		vms->end = end;
	} else {
		vms->start = vms->end = 0;
	}
	vms->unlock = unlock;
	vms->uf = uf;
	vms->vma_count = 0;
	vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
	vms->unmap_start = FIRST_USER_ADDRESS;
	vms->unmap_end = USER_PGTABLES_CEILING;
	vms->clear_ptes = false;
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
 *          success.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct mm_struct *mm, unsigned long start, unsigned long end,
		struct list_head *uf, bool unlock)
{
	struct maple_tree mt_detach;
	MA_STATE(mas_detach, &mt_detach, 0, 0);
	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
	mt_on_stack(mt_detach);
	struct vma_munmap_struct vms;
	int error;

	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
	error = vms_gather_munmap_vmas(&vms, &mas_detach);
	if (error)
		goto gather_failed;

	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
	if (error)
		goto clear_tree_failed;

	/* Point of no return */
	vms_complete_munmap_vmas(&vms, &mas_detach);
	return 0;

clear_tree_failed:
	reattach_vmas(&mas_detach);
gather_failed:
	validate_mm(mm);
	return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * This function takes @vmi, which is either pointing to the previous VMA or
 * set to MA_START, and sets it up to remove the mapping(s). The @len will be
 * aligned.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
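 *
 * Minimal usage sketch (illustrative only; assumes the caller already holds
 * the mmap write lock and @uf points to an initialised list_head):
 *
 *	VMA_ITERATOR(vmi, mm, start);
 *
 *	ret = do_vmi_munmap(&vmi, mm, start, len, uf, false);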
1570 */ 1571 int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 1572 unsigned long start, size_t len, struct list_head *uf, 1573 bool unlock) 1574 { 1575 unsigned long end; 1576 struct vm_area_struct *vma; 1577 1578 if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) 1579 return -EINVAL; 1580 1581 end = start + PAGE_ALIGN(len); 1582 if (end == start) 1583 return -EINVAL; 1584 1585 /* Find the first overlapping VMA */ 1586 vma = vma_find(vmi, end); 1587 if (!vma) { 1588 if (unlock) 1589 mmap_write_unlock(mm); 1590 return 0; 1591 } 1592 1593 return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); 1594 } 1595 1596 /* 1597 * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd 1598 * context and anonymous VMA name within the range [start, end). 1599 * 1600 * As a result, we might be able to merge the newly modified VMA range with an 1601 * adjacent VMA with identical properties. 1602 * 1603 * If no merge is possible and the range does not span the entirety of the VMA, 1604 * we then need to split the VMA to accommodate the change. 1605 * 1606 * The function returns either the merged VMA, the original VMA if a split was 1607 * required instead, or an error if the split failed. 1608 */ 1609 static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) 1610 { 1611 struct vm_area_struct *vma = vmg->middle; 1612 unsigned long start = vmg->start; 1613 unsigned long end = vmg->end; 1614 struct vm_area_struct *merged; 1615 1616 /* First, try to merge. */ 1617 merged = vma_merge_existing_range(vmg); 1618 if (merged) 1619 return merged; 1620 if (vmg_nomem(vmg)) 1621 return ERR_PTR(-ENOMEM); 1622 1623 /* 1624 * Split can fail for reasons other than OOM, so if the user requests 1625 * this it's probably a mistake. 1626 */ 1627 VM_WARN_ON(vmg->give_up_on_oom && 1628 (vma->vm_start != start || vma->vm_end != end)); 1629 1630 /* Split any preceding portion of the VMA. */ 1631 if (vma->vm_start < start) { 1632 int err = split_vma(vmg->vmi, vma, start, 1); 1633 1634 if (err) 1635 return ERR_PTR(err); 1636 } 1637 1638 /* Split any trailing portion of the VMA. 
*/ 1639 if (vma->vm_end > end) { 1640 int err = split_vma(vmg->vmi, vma, end, 0); 1641 1642 if (err) 1643 return ERR_PTR(err); 1644 } 1645 1646 return vma; 1647 } 1648 1649 struct vm_area_struct *vma_modify_flags( 1650 struct vma_iterator *vmi, struct vm_area_struct *prev, 1651 struct vm_area_struct *vma, unsigned long start, unsigned long end, 1652 unsigned long new_flags) 1653 { 1654 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); 1655 1656 vmg.flags = new_flags; 1657 1658 return vma_modify(&vmg); 1659 } 1660 1661 struct vm_area_struct 1662 *vma_modify_flags_name(struct vma_iterator *vmi, 1663 struct vm_area_struct *prev, 1664 struct vm_area_struct *vma, 1665 unsigned long start, 1666 unsigned long end, 1667 unsigned long new_flags, 1668 struct anon_vma_name *new_name) 1669 { 1670 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); 1671 1672 vmg.flags = new_flags; 1673 vmg.anon_name = new_name; 1674 1675 return vma_modify(&vmg); 1676 } 1677 1678 struct vm_area_struct 1679 *vma_modify_policy(struct vma_iterator *vmi, 1680 struct vm_area_struct *prev, 1681 struct vm_area_struct *vma, 1682 unsigned long start, unsigned long end, 1683 struct mempolicy *new_pol) 1684 { 1685 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); 1686 1687 vmg.policy = new_pol; 1688 1689 return vma_modify(&vmg); 1690 } 1691 1692 struct vm_area_struct 1693 *vma_modify_flags_uffd(struct vma_iterator *vmi, 1694 struct vm_area_struct *prev, 1695 struct vm_area_struct *vma, 1696 unsigned long start, unsigned long end, 1697 unsigned long new_flags, 1698 struct vm_userfaultfd_ctx new_ctx, 1699 bool give_up_on_oom) 1700 { 1701 VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); 1702 1703 vmg.flags = new_flags; 1704 vmg.uffd_ctx = new_ctx; 1705 if (give_up_on_oom) 1706 vmg.give_up_on_oom = true; 1707 1708 return vma_modify(&vmg); 1709 } 1710 1711 /* 1712 * Expand vma by delta bytes, potentially merging with an immediately adjacent 1713 * VMA with identical properties. 1714 */ 1715 struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1716 struct vm_area_struct *vma, 1717 unsigned long delta) 1718 { 1719 VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); 1720 1721 vmg.next = vma_iter_next_rewind(vmi, NULL); 1722 vmg.middle = NULL; /* We use the VMA to populate VMG fields only. 
*/ 1723 1724 return vma_merge_new_range(&vmg); 1725 } 1726 1727 void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) 1728 { 1729 vb->count = 0; 1730 } 1731 1732 static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) 1733 { 1734 struct address_space *mapping; 1735 int i; 1736 1737 mapping = vb->vmas[0]->vm_file->f_mapping; 1738 i_mmap_lock_write(mapping); 1739 for (i = 0; i < vb->count; i++) { 1740 VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); 1741 __remove_shared_vm_struct(vb->vmas[i], mapping); 1742 } 1743 i_mmap_unlock_write(mapping); 1744 1745 unlink_file_vma_batch_init(vb); 1746 } 1747 1748 void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 1749 struct vm_area_struct *vma) 1750 { 1751 if (vma->vm_file == NULL) 1752 return; 1753 1754 if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || 1755 vb->count == ARRAY_SIZE(vb->vmas)) 1756 unlink_file_vma_batch_process(vb); 1757 1758 vb->vmas[vb->count] = vma; 1759 vb->count++; 1760 } 1761 1762 void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) 1763 { 1764 if (vb->count > 0) 1765 unlink_file_vma_batch_process(vb); 1766 } 1767 1768 /* 1769 * Unlink a file-based vm structure from its interval tree, to hide 1770 * vma from rmap and vmtruncate before freeing its page tables. 1771 */ 1772 void unlink_file_vma(struct vm_area_struct *vma) 1773 { 1774 struct file *file = vma->vm_file; 1775 1776 if (file) { 1777 struct address_space *mapping = file->f_mapping; 1778 1779 i_mmap_lock_write(mapping); 1780 __remove_shared_vm_struct(vma, mapping); 1781 i_mmap_unlock_write(mapping); 1782 } 1783 } 1784 1785 void vma_link_file(struct vm_area_struct *vma) 1786 { 1787 struct file *file = vma->vm_file; 1788 struct address_space *mapping; 1789 1790 if (file) { 1791 mapping = file->f_mapping; 1792 i_mmap_lock_write(mapping); 1793 __vma_link_file(vma, mapping); 1794 i_mmap_unlock_write(mapping); 1795 } 1796 } 1797 1798 int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) 1799 { 1800 VMA_ITERATOR(vmi, mm, 0); 1801 1802 vma_iter_config(&vmi, vma->vm_start, vma->vm_end); 1803 if (vma_iter_prealloc(&vmi, vma)) 1804 return -ENOMEM; 1805 1806 vma_start_write(vma); 1807 vma_iter_store_new(&vmi, vma); 1808 vma_link_file(vma); 1809 mm->map_count++; 1810 validate_mm(mm); 1811 return 0; 1812 } 1813 1814 /* 1815 * Copy the vma structure to a new location in the same mm, 1816 * prior to moving page table entries, to effect an mremap move. 1817 */ 1818 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 1819 unsigned long addr, unsigned long len, pgoff_t pgoff, 1820 bool *need_rmap_locks) 1821 { 1822 struct vm_area_struct *vma = *vmap; 1823 unsigned long vma_start = vma->vm_start; 1824 struct mm_struct *mm = vma->vm_mm; 1825 struct vm_area_struct *new_vma; 1826 bool faulted_in_anon_vma = true; 1827 VMA_ITERATOR(vmi, mm, addr); 1828 VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); 1829 1830 /* 1831 * If anonymous vma has not yet been faulted, update new pgoff 1832 * to match new location, to increase its chance of merging. 1833 */ 1834 if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { 1835 pgoff = addr >> PAGE_SHIFT; 1836 faulted_in_anon_vma = false; 1837 } 1838 1839 /* 1840 * If the VMA we are copying might contain a uprobe PTE, ensure 1841 * that we do not establish one upon merge. Otherwise, when mremap() 1842 * moves page tables, it will orphan the newly created PTE. 
1843 */ 1844 if (vma->vm_file) 1845 vmg.skip_vma_uprobe = true; 1846 1847 new_vma = find_vma_prev(mm, addr, &vmg.prev); 1848 if (new_vma && new_vma->vm_start < addr + len) 1849 return NULL; /* should never get here */ 1850 1851 vmg.middle = NULL; /* New VMA range. */ 1852 vmg.pgoff = pgoff; 1853 vmg.next = vma_iter_next_rewind(&vmi, NULL); 1854 new_vma = vma_merge_new_range(&vmg); 1855 1856 if (new_vma) { 1857 /* 1858 * Source vma may have been merged into new_vma 1859 */ 1860 if (unlikely(vma_start >= new_vma->vm_start && 1861 vma_start < new_vma->vm_end)) { 1862 /* 1863 * The only way we can get a vma_merge with 1864 * self during an mremap is if the vma hasn't 1865 * been faulted in yet and we were allowed to 1866 * reset the dst vma->vm_pgoff to the 1867 * destination address of the mremap to allow 1868 * the merge to happen. mremap must change the 1869 * vm_pgoff linearity between src and dst vmas 1870 * (in turn preventing a vma_merge) to be 1871 * safe. It is only safe to keep the vm_pgoff 1872 * linear if there are no pages mapped yet. 1873 */ 1874 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 1875 *vmap = vma = new_vma; 1876 } 1877 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 1878 } else { 1879 new_vma = vm_area_dup(vma); 1880 if (!new_vma) 1881 goto out; 1882 vma_set_range(new_vma, addr, addr + len, pgoff); 1883 if (vma_dup_policy(vma, new_vma)) 1884 goto out_free_vma; 1885 if (anon_vma_clone(new_vma, vma)) 1886 goto out_free_mempol; 1887 if (new_vma->vm_file) 1888 get_file(new_vma->vm_file); 1889 if (new_vma->vm_ops && new_vma->vm_ops->open) 1890 new_vma->vm_ops->open(new_vma); 1891 if (vma_link(mm, new_vma)) 1892 goto out_vma_link; 1893 *need_rmap_locks = false; 1894 } 1895 return new_vma; 1896 1897 out_vma_link: 1898 fixup_hugetlb_reservations(new_vma); 1899 vma_close(new_vma); 1900 1901 if (new_vma->vm_file) 1902 fput(new_vma->vm_file); 1903 1904 unlink_anon_vmas(new_vma); 1905 out_free_mempol: 1906 mpol_put(vma_policy(new_vma)); 1907 out_free_vma: 1908 vm_area_free(new_vma); 1909 out: 1910 return NULL; 1911 } 1912 1913 /* 1914 * Rough compatibility check to quickly see if it's even worth looking 1915 * at sharing an anon_vma. 1916 * 1917 * They need to have the same vm_file, and the flags can only differ 1918 * in things that mprotect may change. 1919 * 1920 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1921 * we can merge the two vma's. For example, we refuse to merge a vma if 1922 * there is a vm_ops->close() function, because that indicates that the 1923 * driver is doing some kind of reference counting. But that doesn't 1924 * really matter for the anon_vma sharing case. 1925 */ 1926 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1927 { 1928 return a->vm_end == b->vm_start && 1929 mpol_equal(vma_policy(a), vma_policy(b)) && 1930 a->vm_file == b->vm_file && 1931 !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && 1932 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1933 } 1934 1935 /* 1936 * Do some basic sanity checking to see if we can re-use the anon_vma 1937 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1938 * the same as 'old', the other will be the new one that is trying 1939 * to share the anon_vma. 1940 * 1941 * NOTE! This runs with mmap_lock held for reading, so it is possible that 1942 * the anon_vma of 'old' is concurrently in the process of being set up 1943 * by another page fault trying to merge _that_. 
But that's ok: if it 1944 * is being set up, that automatically means that it will be a singleton 1945 * acceptable for merging, so we can do all of this optimistically. But 1946 * we do that READ_ONCE() to make sure that we never re-load the pointer. 1947 * 1948 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1949 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1950 * is to return an anon_vma that is "complex" due to having gone through 1951 * a fork). 1952 * 1953 * We also make sure that the two vma's are compatible (adjacent, 1954 * and with the same memory policies). That's all stable, even with just 1955 * a read lock on the mmap_lock. 1956 */ 1957 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, 1958 struct vm_area_struct *a, 1959 struct vm_area_struct *b) 1960 { 1961 if (anon_vma_compatible(a, b)) { 1962 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1963 1964 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1965 return anon_vma; 1966 } 1967 return NULL; 1968 } 1969 1970 /* 1971 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1972 * neighbouring vmas for a suitable anon_vma, before it goes off 1973 * to allocate a new anon_vma. It checks because a repetitive 1974 * sequence of mprotects and faults may otherwise lead to distinct 1975 * anon_vmas being allocated, preventing vma merge in subsequent 1976 * mprotect. 1977 */ 1978 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1979 { 1980 struct anon_vma *anon_vma = NULL; 1981 struct vm_area_struct *prev, *next; 1982 VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1983 1984 /* Try next first. */ 1985 next = vma_iter_load(&vmi); 1986 if (next) { 1987 anon_vma = reusable_anon_vma(next, vma, next); 1988 if (anon_vma) 1989 return anon_vma; 1990 } 1991 1992 prev = vma_prev(&vmi); 1993 VM_BUG_ON_VMA(prev != vma, vma); 1994 prev = vma_prev(&vmi); 1995 /* Try prev next. */ 1996 if (prev) 1997 anon_vma = reusable_anon_vma(prev, prev, vma); 1998 1999 /* 2000 * We might reach here with anon_vma == NULL if we can't find 2001 * any reusable anon_vma. 2002 * There's no absolute need to look only at touching neighbours: 2003 * we could search further afield for "compatible" anon_vmas. 2004 * But it would probably just be a waste of time searching, 2005 * or lead to too many vmas hanging off the same anon_vma. 2006 * We're trying to allow mprotect remerging later on, 2007 * not trying to minimize memory used for anon_vmas. 2008 */ 2009 return anon_vma; 2010 } 2011 2012 static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 2013 { 2014 return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 2015 } 2016 2017 static bool vma_is_shared_writable(struct vm_area_struct *vma) 2018 { 2019 return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 2020 (VM_WRITE | VM_SHARED); 2021 } 2022 2023 static bool vma_fs_can_writeback(struct vm_area_struct *vma) 2024 { 2025 /* No managed pages to writeback. */ 2026 if (vma->vm_flags & VM_PFNMAP) 2027 return false; 2028 2029 return vma->vm_file && vma->vm_file->f_mapping && 2030 mapping_can_writeback(vma->vm_file->f_mapping); 2031 } 2032 2033 /* 2034 * Does this VMA require the underlying folios to have their dirty state 2035 * tracked? 2036 */ 2037 bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 2038 { 2039 /* Only shared, writable VMAs require dirty tracking. */ 2040 if (!vma_is_shared_writable(vma)) 2041 return false; 2042 2043 /* Does the filesystem need to be notified? 
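 * (That is: does it implement ->page_mkwrite() or ->pfn_mkwrite(), so it
 * must intercept the first write to each clean page, e.g. to mark the
 * folio dirty or reserve backing blocks?)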
*/ 2044 if (vm_ops_needs_writenotify(vma->vm_ops)) 2045 return true; 2046 2047 /* 2048 * Even if the filesystem doesn't indicate a need for writenotify, if it 2049 * can writeback, dirty tracking is still required. 2050 */ 2051 return vma_fs_can_writeback(vma); 2052 } 2053 2054 /* 2055 * Some shared mappings will want the pages marked read-only 2056 * to track write events. If so, we'll downgrade vm_page_prot 2057 * to the private version (using protection_map[] without the 2058 * VM_SHARED bit). 2059 */ 2060 bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 2061 { 2062 /* If it was private or non-writable, the write bit is already clear */ 2063 if (!vma_is_shared_writable(vma)) 2064 return false; 2065 2066 /* The backer wishes to know when pages are first written to? */ 2067 if (vm_ops_needs_writenotify(vma->vm_ops)) 2068 return true; 2069 2070 /* The open routine did something to the protections that pgprot_modify 2071 * won't preserve? */ 2072 if (pgprot_val(vm_page_prot) != 2073 pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 2074 return false; 2075 2076 /* 2077 * Do we need to track softdirty? hugetlb does not support softdirty 2078 * tracking yet. 2079 */ 2080 if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 2081 return true; 2082 2083 /* Do we need write faults for uffd-wp tracking? */ 2084 if (userfaultfd_wp(vma)) 2085 return true; 2086 2087 /* Can the mapping track the dirty pages? */ 2088 return vma_fs_can_writeback(vma); 2089 } 2090 2091 static DEFINE_MUTEX(mm_all_locks_mutex); 2092 2093 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2094 { 2095 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2096 /* 2097 * The LSB of head.next can't change from under us 2098 * because we hold the mm_all_locks_mutex. 2099 */ 2100 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 2101 /* 2102 * We can safely modify head.next after taking the 2103 * anon_vma->root->rwsem. If some other vma in this mm shares 2104 * the same anon_vma we won't take it again. 2105 * 2106 * No need of atomic instructions here, head.next 2107 * can't change from under us thanks to the 2108 * anon_vma->root->rwsem. 2109 */ 2110 if (__test_and_set_bit(0, (unsigned long *) 2111 &anon_vma->root->rb_root.rb_root.rb_node)) 2112 BUG(); 2113 } 2114 } 2115 2116 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 2117 { 2118 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2119 /* 2120 * AS_MM_ALL_LOCKS can't change from under us because 2121 * we hold the mm_all_locks_mutex. 2122 * 2123 * Operations on ->flags have to be atomic because 2124 * even if AS_MM_ALL_LOCKS is stable thanks to the 2125 * mm_all_locks_mutex, there may be other cpus 2126 * changing other bitflags in parallel to us. 2127 */ 2128 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2129 BUG(); 2130 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 2131 } 2132 } 2133 2134 /* 2135 * This operation locks against the VM for all pte/vma/mm related 2136 * operations that could ever happen on a certain mm. This includes 2137 * vmtruncate, try_to_unmap, and all page faults. 2138 * 2139 * The caller must take the mmap_lock in write mode before calling 2140 * mm_take_all_locks(). The caller isn't allowed to release the 2141 * mmap_lock until mm_drop_all_locks() returns. 
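 *
 * A minimal usage sketch, modelled loosely on existing callers such as the
 * mmu_notifier registration path (for illustration only):
 *
 *	mmap_write_lock(mm);
 *	if (!mm_take_all_locks(mm)) {
 *		... operate on state spanning every VMA in mm ...
 *		mm_drop_all_locks(mm);
 *	}
 *	mmap_write_unlock(mm);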
2142  *
2143  * mmap_lock in write mode is required in order to block all operations
2144  * that could modify pagetables and free pages without need of
2145  * altering the vma layout. It's also needed in write mode to prevent new
2146  * anon_vmas from being associated with existing vmas.
2147  *
2148  * A single task can't take more than one mm_take_all_locks() in a row
2149  * or it would deadlock.
2150  *
2151  * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2152  * mapping->flags avoid taking the same lock twice, if more than one
2153  * vma in this mm is backed by the same anon_vma or address_space.
2154  *
2155  * We take locks in the following order, according to the comment at the
2156  * beginning of mm/rmap.c:
2157  *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
2158  *     hugetlb mapping);
2159  *   - all vmas marked locked;
2160  *   - all i_mmap_rwsem locks;
2161  *   - all anon_vma->rwsem locks.
2162  *
2163  * We can take all locks within these types randomly because the VM code
2164  * doesn't nest them and we are protected from parallel mm_take_all_locks() by
2165  * mm_all_locks_mutex.
2166  *
2167  * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
2168  * that may have to take thousands of locks.
2169  *
2170  * mm_take_all_locks() can fail if it's interrupted by signals.
2171  */
2172 int mm_take_all_locks(struct mm_struct *mm)
2173 {
2174         struct vm_area_struct *vma;
2175         struct anon_vma_chain *avc;
2176         VMA_ITERATOR(vmi, mm, 0);
2177 
2178         mmap_assert_write_locked(mm);
2179 
2180         mutex_lock(&mm_all_locks_mutex);
2181 
2182         /*
2183          * vma_start_write() does not have a complement in mm_drop_all_locks()
2184          * because vma_start_write() is always asymmetrical; it marks a VMA as
2185          * being written to until mmap_write_unlock() or mmap_write_downgrade()
2186          * is reached.
2187          */
2188         for_each_vma(vmi, vma) {
2189                 if (signal_pending(current))
2190                         goto out_unlock;
2191                 vma_start_write(vma);
2192         }
2193 
2194         vma_iter_init(&vmi, mm, 0);
2195         for_each_vma(vmi, vma) {
2196                 if (signal_pending(current))
2197                         goto out_unlock;
2198                 if (vma->vm_file && vma->vm_file->f_mapping &&
2199                                 is_vm_hugetlb_page(vma))
2200                         vm_lock_mapping(mm, vma->vm_file->f_mapping);
2201         }
2202 
2203         vma_iter_init(&vmi, mm, 0);
2204         for_each_vma(vmi, vma) {
2205                 if (signal_pending(current))
2206                         goto out_unlock;
2207                 if (vma->vm_file && vma->vm_file->f_mapping &&
2208                                 !is_vm_hugetlb_page(vma))
2209                         vm_lock_mapping(mm, vma->vm_file->f_mapping);
2210         }
2211 
2212         vma_iter_init(&vmi, mm, 0);
2213         for_each_vma(vmi, vma) {
2214                 if (signal_pending(current))
2215                         goto out_unlock;
2216                 if (vma->anon_vma)
2217                         list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2218                                 vm_lock_anon_vma(mm, avc->anon_vma);
2219         }
2220 
2221         return 0;
2222 
2223 out_unlock:
2224         mm_drop_all_locks(mm);
2225         return -EINTR;
2226 }
2227 
2228 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2229 {
2230         if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2231                 /*
2232                  * The LSB of head.next can't change to 0 from under
2233                  * us because we hold the mm_all_locks_mutex.
2234                  *
2235                  * We must however clear the bitflag before unlocking
2236                  * the vma so the users using the anon_vma->rb_root will
2237                  * never see our bitflag.
2238                  *
2239                  * No need of atomic instructions here, head.next
2240                  * can't change from under us until we release the
2241                  * anon_vma->root->rwsem.
2242 */ 2243 if (!__test_and_clear_bit(0, (unsigned long *) 2244 &anon_vma->root->rb_root.rb_root.rb_node)) 2245 BUG(); 2246 anon_vma_unlock_write(anon_vma); 2247 } 2248 } 2249 2250 static void vm_unlock_mapping(struct address_space *mapping) 2251 { 2252 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2253 /* 2254 * AS_MM_ALL_LOCKS can't change to 0 from under us 2255 * because we hold the mm_all_locks_mutex. 2256 */ 2257 i_mmap_unlock_write(mapping); 2258 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2259 &mapping->flags)) 2260 BUG(); 2261 } 2262 } 2263 2264 /* 2265 * The mmap_lock cannot be released by the caller until 2266 * mm_drop_all_locks() returns. 2267 */ 2268 void mm_drop_all_locks(struct mm_struct *mm) 2269 { 2270 struct vm_area_struct *vma; 2271 struct anon_vma_chain *avc; 2272 VMA_ITERATOR(vmi, mm, 0); 2273 2274 mmap_assert_write_locked(mm); 2275 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2276 2277 for_each_vma(vmi, vma) { 2278 if (vma->anon_vma) 2279 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2280 vm_unlock_anon_vma(avc->anon_vma); 2281 if (vma->vm_file && vma->vm_file->f_mapping) 2282 vm_unlock_mapping(vma->vm_file->f_mapping); 2283 } 2284 2285 mutex_unlock(&mm_all_locks_mutex); 2286 } 2287 2288 /* 2289 * We account for memory if it's a private writeable mapping, 2290 * not hugepages and VM_NORESERVE wasn't set. 2291 */ 2292 static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) 2293 { 2294 /* 2295 * hugetlb has its own accounting separate from the core VM 2296 * VM_HUGETLB may not be set yet so we cannot check for that flag. 2297 */ 2298 if (file && is_file_hugepages(file)) 2299 return false; 2300 2301 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 2302 } 2303 2304 /* 2305 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() 2306 * operation. 2307 * @vms: The vma unmap structure 2308 * @mas_detach: The maple state with the detached maple tree 2309 * 2310 * Reattach any detached vmas, free up the maple tree used to track the vmas. 2311 * If that's not possible because the ptes are cleared (and vm_ops->closed() may 2312 * have been called), then a NULL is written over the vmas and the vmas are 2313 * removed (munmap() completed). 2314 */ 2315 static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, 2316 struct ma_state *mas_detach) 2317 { 2318 struct ma_state *mas = &vms->vmi->mas; 2319 2320 if (!vms->nr_pages) 2321 return; 2322 2323 if (vms->clear_ptes) 2324 return reattach_vmas(mas_detach); 2325 2326 /* 2327 * Aborting cannot just call the vm_ops open() because they are often 2328 * not symmetrical and state data has been lost. Resort to the old 2329 * failure method of leaving a gap where the MAP_FIXED mapping failed. 2330 */ 2331 mas_set_range(mas, vms->start, vms->end - 1); 2332 mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); 2333 /* Clean up the insertion of the unfortunate gap */ 2334 vms_complete_munmap_vmas(vms, mas_detach); 2335 } 2336 2337 /* 2338 * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be 2339 * unmapped once the map operation is completed, check limits, account mapping 2340 * and clean up any pre-existing VMAs. 2341 * 2342 * @map: Mapping state. 2343 * @uf: Userfaultfd context list. 2344 * 2345 * Returns: 0 on success, error code otherwise. 
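 *
 * __mmap_prepare() is the first of the phases driven by __mmap_region():
 * prepare (gather overlapping VMAs, check limits, charge memory), then
 * either merge with an adjacent VMA or allocate one via __mmap_new_vma(),
 * and finally __mmap_complete() (finish the unmap, update statistics and
 * finalise the new VMA).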
2346 */ 2347 static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) 2348 { 2349 int error; 2350 struct vma_iterator *vmi = map->vmi; 2351 struct vma_munmap_struct *vms = &map->vms; 2352 2353 /* Find the first overlapping VMA and initialise unmap state. */ 2354 vms->vma = vma_find(vmi, map->end); 2355 init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, 2356 /* unlock = */ false); 2357 2358 /* OK, we have overlapping VMAs - prepare to unmap them. */ 2359 if (vms->vma) { 2360 mt_init_flags(&map->mt_detach, 2361 vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 2362 mt_on_stack(map->mt_detach); 2363 mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); 2364 /* Prepare to unmap any existing mapping in the area */ 2365 error = vms_gather_munmap_vmas(vms, &map->mas_detach); 2366 if (error) { 2367 /* On error VMAs will already have been reattached. */ 2368 vms->nr_pages = 0; 2369 return error; 2370 } 2371 2372 map->next = vms->next; 2373 map->prev = vms->prev; 2374 } else { 2375 map->next = vma_iter_next_rewind(vmi, &map->prev); 2376 } 2377 2378 /* Check against address space limit. */ 2379 if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) 2380 return -ENOMEM; 2381 2382 /* Private writable mapping: check memory availability. */ 2383 if (accountable_mapping(map->file, map->flags)) { 2384 map->charged = map->pglen; 2385 map->charged -= vms->nr_accounted; 2386 if (map->charged) { 2387 error = security_vm_enough_memory_mm(map->mm, map->charged); 2388 if (error) 2389 return error; 2390 } 2391 2392 vms->nr_accounted = 0; 2393 map->flags |= VM_ACCOUNT; 2394 } 2395 2396 /* 2397 * Clear PTEs while the vma is still in the tree so that rmap 2398 * cannot race with the freeing later in the truncate scenario. 2399 * This is also needed for mmap_file(), which is why vm_ops 2400 * close function is called. 2401 */ 2402 vms_clean_up_area(vms, &map->mas_detach); 2403 2404 return 0; 2405 } 2406 2407 2408 static int __mmap_new_file_vma(struct mmap_state *map, 2409 struct vm_area_struct *vma) 2410 { 2411 struct vma_iterator *vmi = map->vmi; 2412 int error; 2413 2414 vma->vm_file = get_file(map->file); 2415 2416 if (!map->file->f_op->mmap) 2417 return 0; 2418 2419 error = mmap_file(vma->vm_file, vma); 2420 if (error) { 2421 fput(vma->vm_file); 2422 vma->vm_file = NULL; 2423 2424 vma_iter_set(vmi, vma->vm_end); 2425 /* Undo any partial mapping done by a device driver. */ 2426 unmap_region(&vmi->mas, vma, map->prev, map->next); 2427 2428 return error; 2429 } 2430 2431 /* Drivers cannot alter the address of the VMA. */ 2432 WARN_ON_ONCE(map->addr != vma->vm_start); 2433 /* 2434 * Drivers should not permit writability when previously it was 2435 * disallowed. 2436 */ 2437 VM_WARN_ON_ONCE(map->flags != vma->vm_flags && 2438 !(map->flags & VM_MAYWRITE) && 2439 (vma->vm_flags & VM_MAYWRITE)); 2440 2441 map->flags = vma->vm_flags; 2442 2443 return 0; 2444 } 2445 2446 /* 2447 * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not 2448 * possible. 2449 * 2450 * @map: Mapping state. 2451 * @vmap: Output pointer for the new VMA. 2452 * 2453 * Returns: Zero on success, or an error. 2454 */ 2455 static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) 2456 { 2457 struct vma_iterator *vmi = map->vmi; 2458 int error = 0; 2459 struct vm_area_struct *vma; 2460 2461 /* 2462 * Determine the object being mapped and call the appropriate 2463 * specific mapper. 
the address has already been validated, but 2464 * not unmapped, but the maps are removed from the list. 2465 */ 2466 vma = vm_area_alloc(map->mm); 2467 if (!vma) 2468 return -ENOMEM; 2469 2470 vma_iter_config(vmi, map->addr, map->end); 2471 vma_set_range(vma, map->addr, map->end, map->pgoff); 2472 vm_flags_init(vma, map->flags); 2473 vma->vm_page_prot = map->page_prot; 2474 2475 if (vma_iter_prealloc(vmi, vma)) { 2476 error = -ENOMEM; 2477 goto free_vma; 2478 } 2479 2480 if (map->file) 2481 error = __mmap_new_file_vma(map, vma); 2482 else if (map->flags & VM_SHARED) 2483 error = shmem_zero_setup(vma); 2484 else 2485 vma_set_anonymous(vma); 2486 2487 if (error) 2488 goto free_iter_vma; 2489 2490 #ifdef CONFIG_SPARC64 2491 /* TODO: Fix SPARC ADI! */ 2492 WARN_ON_ONCE(!arch_validate_flags(map->flags)); 2493 #endif 2494 2495 /* Lock the VMA since it is modified after insertion into VMA tree */ 2496 vma_start_write(vma); 2497 vma_iter_store_new(vmi, vma); 2498 map->mm->map_count++; 2499 vma_link_file(vma); 2500 2501 /* 2502 * vma_merge_new_range() calls khugepaged_enter_vma() too, the below 2503 * call covers the non-merge case. 2504 */ 2505 if (!vma_is_anonymous(vma)) 2506 khugepaged_enter_vma(vma, map->flags); 2507 ksm_add_vma(vma); 2508 *vmap = vma; 2509 return 0; 2510 2511 free_iter_vma: 2512 vma_iter_free(vmi); 2513 free_vma: 2514 vm_area_free(vma); 2515 return error; 2516 } 2517 2518 /* 2519 * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping 2520 * statistics, handle locking and finalise the VMA. 2521 * 2522 * @map: Mapping state. 2523 * @vma: Merged or newly allocated VMA for the mmap()'d region. 2524 */ 2525 static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) 2526 { 2527 struct mm_struct *mm = map->mm; 2528 unsigned long vm_flags = vma->vm_flags; 2529 2530 perf_event_mmap(vma); 2531 2532 /* Unmap any existing mapping in the area. */ 2533 vms_complete_munmap_vmas(&map->vms, &map->mas_detach); 2534 2535 vm_stat_account(mm, vma->vm_flags, map->pglen); 2536 if (vm_flags & VM_LOCKED) { 2537 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || 2538 is_vm_hugetlb_page(vma) || 2539 vma == get_gate_vma(mm)) 2540 vm_flags_clear(vma, VM_LOCKED_MASK); 2541 else 2542 mm->locked_vm += map->pglen; 2543 } 2544 2545 if (vma->vm_file) 2546 uprobe_mmap(vma); 2547 2548 /* 2549 * New (or expanded) vma always get soft dirty status. 2550 * Otherwise user-space soft-dirty page tracker won't 2551 * be able to distinguish situation when vma area unmapped, 2552 * then new mapped in-place (which must be aimed as 2553 * a completely new data area). 2554 */ 2555 vm_flags_set(vma, VM_SOFTDIRTY); 2556 2557 vma_set_page_prot(vma); 2558 } 2559 2560 /* 2561 * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that 2562 * specifies it. 2563 * 2564 * This is called prior to any merge attempt, and updates whitelisted fields 2565 * that are permitted to be updated by the caller. 2566 * 2567 * All but user-defined fields will be pre-populated with original values. 2568 * 2569 * Returns 0 on success, or an error code otherwise. 2570 */ 2571 static int call_mmap_prepare(struct mmap_state *map) 2572 { 2573 int err; 2574 struct vm_area_desc desc = { 2575 .mm = map->mm, 2576 .start = map->addr, 2577 .end = map->end, 2578 2579 .pgoff = map->pgoff, 2580 .file = map->file, 2581 .vm_flags = map->flags, 2582 .page_prot = map->page_prot, 2583 }; 2584 2585 /* Invoke the hook. 
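 * (Effectively map->file->f_op->mmap_prepare(&desc); the hook may adjust
 * the whitelisted fields of @desc, which are copied back into @map below.)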
*/ 2586 err = __call_mmap_prepare(map->file, &desc); 2587 if (err) 2588 return err; 2589 2590 /* Update fields permitted to be changed. */ 2591 map->pgoff = desc.pgoff; 2592 map->file = desc.file; 2593 map->flags = desc.vm_flags; 2594 map->page_prot = desc.page_prot; 2595 /* User-defined fields. */ 2596 map->vm_ops = desc.vm_ops; 2597 map->vm_private_data = desc.private_data; 2598 2599 return 0; 2600 } 2601 2602 static void set_vma_user_defined_fields(struct vm_area_struct *vma, 2603 struct mmap_state *map) 2604 { 2605 if (map->vm_ops) 2606 vma->vm_ops = map->vm_ops; 2607 vma->vm_private_data = map->vm_private_data; 2608 } 2609 2610 static unsigned long __mmap_region(struct file *file, unsigned long addr, 2611 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2612 struct list_head *uf) 2613 { 2614 struct mm_struct *mm = current->mm; 2615 struct vm_area_struct *vma = NULL; 2616 int error; 2617 bool have_mmap_prepare = file && file->f_op->mmap_prepare; 2618 VMA_ITERATOR(vmi, mm, addr); 2619 MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); 2620 2621 error = __mmap_prepare(&map, uf); 2622 if (!error && have_mmap_prepare) 2623 error = call_mmap_prepare(&map); 2624 if (error) 2625 goto abort_munmap; 2626 2627 /* Attempt to merge with adjacent VMAs... */ 2628 if (map.prev || map.next) { 2629 VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); 2630 2631 vma = vma_merge_new_range(&vmg); 2632 } 2633 2634 /* ...but if we can't, allocate a new VMA. */ 2635 if (!vma) { 2636 error = __mmap_new_vma(&map, &vma); 2637 if (error) 2638 goto unacct_error; 2639 } 2640 2641 if (have_mmap_prepare) 2642 set_vma_user_defined_fields(vma, &map); 2643 2644 __mmap_complete(&map, vma); 2645 2646 return addr; 2647 2648 /* Accounting was done by __mmap_prepare(). */ 2649 unacct_error: 2650 if (map.charged) 2651 vm_unacct_memory(map.charged); 2652 abort_munmap: 2653 vms_abort_munmap_vmas(&map.vms, &map.mas_detach); 2654 return error; 2655 } 2656 2657 /** 2658 * mmap_region() - Actually perform the userland mapping of a VMA into 2659 * current->mm with known, aligned and overflow-checked @addr and @len, and 2660 * correctly determined VMA flags @vm_flags and page offset @pgoff. 2661 * 2662 * This is an internal memory management function, and should not be used 2663 * directly. 2664 * 2665 * The caller must write-lock current->mm->mmap_lock. 2666 * 2667 * @file: If a file-backed mapping, a pointer to the struct file describing the 2668 * file to be mapped, otherwise NULL. 2669 * @addr: The page-aligned address at which to perform the mapping. 2670 * @len: The page-aligned, non-zero, length of the mapping. 2671 * @vm_flags: The VMA flags which should be applied to the mapping. 2672 * @pgoff: If @file is specified, the page offset into the file, if not then 2673 * the virtual page offset in memory of the anonymous mapping. 2674 * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap 2675 * events. 2676 * 2677 * Returns: Either an error, or the address at which the requested mapping has 2678 * been performed. 2679 */ 2680 unsigned long mmap_region(struct file *file, unsigned long addr, 2681 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2682 struct list_head *uf) 2683 { 2684 unsigned long ret; 2685 bool writable_file_mapping = false; 2686 2687 mmap_assert_write_locked(current->mm); 2688 2689 /* Check to see if MDWE is applicable. */ 2690 if (map_deny_write_exec(vm_flags, vm_flags)) 2691 return -EACCES; 2692 2693 /* Allow architectures to sanity-check the vm_flags. 
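 * (For example, arm64 rejects VM_MTE without VM_MTE_ALLOWED and sparc64
 * applies similar checks for ADI; see the arch_validate_flags()
 * implementations.)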
*/ 2694 if (!arch_validate_flags(vm_flags)) 2695 return -EINVAL; 2696 2697 /* Map writable and ensure this isn't a sealed memfd. */ 2698 if (file && is_shared_maywrite(vm_flags)) { 2699 int error = mapping_map_writable(file->f_mapping); 2700 2701 if (error) 2702 return error; 2703 writable_file_mapping = true; 2704 } 2705 2706 ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); 2707 2708 /* Clear our write mapping regardless of error. */ 2709 if (writable_file_mapping) 2710 mapping_unmap_writable(file->f_mapping); 2711 2712 validate_mm(current->mm); 2713 return ret; 2714 } 2715 2716 /* 2717 * do_brk_flags() - Increase the brk vma if the flags match. 2718 * @vmi: The vma iterator 2719 * @addr: The start address 2720 * @len: The length of the increase 2721 * @vma: The vma, 2722 * @flags: The VMA Flags 2723 * 2724 * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags 2725 * do not match then create a new anonymous VMA. Eventually we may be able to 2726 * do some brk-specific accounting here. 2727 */ 2728 int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, 2729 unsigned long addr, unsigned long len, unsigned long flags) 2730 { 2731 struct mm_struct *mm = current->mm; 2732 2733 /* 2734 * Check against address space limits by the changed size 2735 * Note: This happens *after* clearing old mappings in some code paths. 2736 */ 2737 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2738 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) 2739 return -ENOMEM; 2740 2741 if (mm->map_count > sysctl_max_map_count) 2742 return -ENOMEM; 2743 2744 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2745 return -ENOMEM; 2746 2747 /* 2748 * Expand the existing vma if possible; Note that singular lists do not 2749 * occur after forking, so the expand will only happen on new VMAs. 2750 */ 2751 if (vma && vma->vm_end == addr) { 2752 VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); 2753 2754 vmg.prev = vma; 2755 /* vmi is positioned at prev, which this mode expects. */ 2756 vmg.just_expand = true; 2757 2758 if (vma_merge_new_range(&vmg)) 2759 goto out; 2760 else if (vmg_nomem(&vmg)) 2761 goto unacct_fail; 2762 } 2763 2764 if (vma) 2765 vma_iter_next_range(vmi); 2766 /* create a vma struct for an anonymous mapping */ 2767 vma = vm_area_alloc(mm); 2768 if (!vma) 2769 goto unacct_fail; 2770 2771 vma_set_anonymous(vma); 2772 vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); 2773 vm_flags_init(vma, flags); 2774 vma->vm_page_prot = vm_get_page_prot(flags); 2775 vma_start_write(vma); 2776 if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) 2777 goto mas_store_fail; 2778 2779 mm->map_count++; 2780 validate_mm(mm); 2781 ksm_add_vma(vma); 2782 out: 2783 perf_event_mmap(vma); 2784 mm->total_vm += len >> PAGE_SHIFT; 2785 mm->data_vm += len >> PAGE_SHIFT; 2786 if (flags & VM_LOCKED) 2787 mm->locked_vm += (len >> PAGE_SHIFT); 2788 vm_flags_set(vma, VM_SOFTDIRTY); 2789 return 0; 2790 2791 mas_store_fail: 2792 vm_area_free(vma); 2793 unacct_fail: 2794 vm_unacct_memory(len >> PAGE_SHIFT); 2795 return -ENOMEM; 2796 } 2797 2798 /** 2799 * unmapped_area() - Find an area between the low_limit and the high_limit with 2800 * the correct alignment and offset, all from @info. Note: current->mm is used 2801 * for the search. 2802 * 2803 * @info: The unmapped area information including the range [low_limit - 2804 * high_limit), the alignment offset and mask. 2805 * 2806 * Return: A memory address or -ENOMEM. 
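 *
 * Worked example with hypothetical numbers: for start_gap == 0,
 * align_offset == 0 and align_mask == 0xff000 (1 MiB alignment of
 * page-aligned addresses), a lowest suitable address of 0x1234000 gives
 * gap = 0x1234000 + ((0 - 0x1234000) & 0xff000) == 0x1300000, i.e. the
 * next 1 MiB boundary at or above the address found.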
2807  */
2808 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
2809 {
2810         unsigned long length, gap;
2811         unsigned long low_limit, high_limit;
2812         struct vm_area_struct *tmp;
2813         VMA_ITERATOR(vmi, current->mm, 0);
2814 
2815         /* Adjust search length to account for worst case alignment overhead */
2816         length = info->length + info->align_mask + info->start_gap;
2817         if (length < info->length)
2818                 return -ENOMEM;
2819 
2820         low_limit = info->low_limit;
2821         if (low_limit < mmap_min_addr)
2822                 low_limit = mmap_min_addr;
2823         high_limit = info->high_limit;
2824 retry:
2825         if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
2826                 return -ENOMEM;
2827 
2828         /*
2829          * Adjust for the gap first so it doesn't interfere with the
2830          * later alignment. The first step is the minimum needed to
2831          * satisfy the start gap, the next step is the minimum needed to
2832          * align that. The result is the minimum needed to satisfy both.
2833          */
2834         gap = vma_iter_addr(&vmi) + info->start_gap;
2835         gap += (info->align_offset - gap) & info->align_mask;
2836         tmp = vma_next(&vmi);
2837         if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
2838                 if (vm_start_gap(tmp) < gap + length - 1) {
2839                         low_limit = tmp->vm_end;
2840                         vma_iter_reset(&vmi);
2841                         goto retry;
2842                 }
2843         } else {
2844                 tmp = vma_prev(&vmi);
2845                 if (tmp && vm_end_gap(tmp) > gap) {
2846                         low_limit = vm_end_gap(tmp);
2847                         vma_iter_reset(&vmi);
2848                         goto retry;
2849                 }
2850         }
2851 
2852         return gap;
2853 }
2854 
2855 /**
2856  * unmapped_area_topdown() - Find an area between the low_limit and the
2857  * high_limit with the correct alignment and offset at the highest available
2858  * address, all from @info. Note: current->mm is used for the search.
2859  *
2860  * @info: The unmapped area information including the range [low_limit -
2861  * high_limit), the alignment offset and mask.
2862  *
2863  * Return: A memory address or -ENOMEM.
2864  */
2865 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
2866 {
2867         unsigned long length, gap, gap_end;
2868         unsigned long low_limit, high_limit;
2869         struct vm_area_struct *tmp;
2870         VMA_ITERATOR(vmi, current->mm, 0);
2871 
2872         /* Adjust search length to account for worst case alignment overhead */
2873         length = info->length + info->align_mask + info->start_gap;
2874         if (length < info->length)
2875                 return -ENOMEM;
2876 
2877         low_limit = info->low_limit;
2878         if (low_limit < mmap_min_addr)
2879                 low_limit = mmap_min_addr;
2880         high_limit = info->high_limit;
2881 retry:
2882         if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
2883                 return -ENOMEM;
2884 
2885         gap = vma_iter_end(&vmi) - info->length;
2886         gap -= (gap - info->align_offset) & info->align_mask;
2887         gap_end = vma_iter_end(&vmi);
2888         tmp = vma_next(&vmi);
2889         if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
2890                 if (vm_start_gap(tmp) < gap_end) {
2891                         high_limit = vm_start_gap(tmp);
2892                         vma_iter_reset(&vmi);
2893                         goto retry;
2894                 }
2895         } else {
2896                 tmp = vma_prev(&vmi);
2897                 if (tmp && vm_end_gap(tmp) > gap) {
2898                         high_limit = tmp->vm_start;
2899                         vma_iter_reset(&vmi);
2900                         goto retry;
2901                 }
2902         }
2903 
2904         return gap;
2905 }
2906 
2907 /*
2908  * Verify that the stack growth is acceptable and
2909  * update accounting. This is shared with both the
2910  * grow-up and grow-down cases.
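 *
 * Note: @size is the prospective total size of the stack VMA in bytes,
 * while @grow is the increment in pages, matching how the callers below
 * compute them.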
2911 */ 2912 static int acct_stack_growth(struct vm_area_struct *vma, 2913 unsigned long size, unsigned long grow) 2914 { 2915 struct mm_struct *mm = vma->vm_mm; 2916 unsigned long new_start; 2917 2918 /* address space limit tests */ 2919 if (!may_expand_vm(mm, vma->vm_flags, grow)) 2920 return -ENOMEM; 2921 2922 /* Stack limit test */ 2923 if (size > rlimit(RLIMIT_STACK)) 2924 return -ENOMEM; 2925 2926 /* mlock limit tests */ 2927 if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) 2928 return -ENOMEM; 2929 2930 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2931 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2932 vma->vm_end - size; 2933 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2934 return -EFAULT; 2935 2936 /* 2937 * Overcommit.. This must be the final test, as it will 2938 * update security statistics. 2939 */ 2940 if (security_vm_enough_memory_mm(mm, grow)) 2941 return -ENOMEM; 2942 2943 return 0; 2944 } 2945 2946 #if defined(CONFIG_STACK_GROWSUP) 2947 /* 2948 * PA-RISC uses this for its stack. 2949 * vma is the last one with address > vma->vm_end. Have to extend vma. 2950 */ 2951 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2952 { 2953 struct mm_struct *mm = vma->vm_mm; 2954 struct vm_area_struct *next; 2955 unsigned long gap_addr; 2956 int error = 0; 2957 VMA_ITERATOR(vmi, mm, vma->vm_start); 2958 2959 if (!(vma->vm_flags & VM_GROWSUP)) 2960 return -EFAULT; 2961 2962 mmap_assert_write_locked(mm); 2963 2964 /* Guard against exceeding limits of the address space. */ 2965 address &= PAGE_MASK; 2966 if (address >= (TASK_SIZE & PAGE_MASK)) 2967 return -ENOMEM; 2968 address += PAGE_SIZE; 2969 2970 /* Enforce stack_guard_gap */ 2971 gap_addr = address + stack_guard_gap; 2972 2973 /* Guard against overflow */ 2974 if (gap_addr < address || gap_addr > TASK_SIZE) 2975 gap_addr = TASK_SIZE; 2976 2977 next = find_vma_intersection(mm, vma->vm_end, gap_addr); 2978 if (next && vma_is_accessible(next)) { 2979 if (!(next->vm_flags & VM_GROWSUP)) 2980 return -ENOMEM; 2981 /* Check that both stack segments have the same anon_vma? */ 2982 } 2983 2984 if (next) 2985 vma_iter_prev_range_limit(&vmi, address); 2986 2987 vma_iter_config(&vmi, vma->vm_start, address); 2988 if (vma_iter_prealloc(&vmi, vma)) 2989 return -ENOMEM; 2990 2991 /* We must make sure the anon_vma is allocated. */ 2992 if (unlikely(anon_vma_prepare(vma))) { 2993 vma_iter_free(&vmi); 2994 return -ENOMEM; 2995 } 2996 2997 /* Lock the VMA before expanding to prevent concurrent page faults */ 2998 vma_start_write(vma); 2999 /* We update the anon VMA tree. */ 3000 anon_vma_lock_write(vma->anon_vma); 3001 3002 /* Somebody else might have raced and expanded it already */ 3003 if (address > vma->vm_end) { 3004 unsigned long size, grow; 3005 3006 size = address - vma->vm_start; 3007 grow = (address - vma->vm_end) >> PAGE_SHIFT; 3008 3009 error = -ENOMEM; 3010 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 3011 error = acct_stack_growth(vma, size, grow); 3012 if (!error) { 3013 if (vma->vm_flags & VM_LOCKED) 3014 mm->locked_vm += grow; 3015 vm_stat_account(mm, vma->vm_flags, grow); 3016 anon_vma_interval_tree_pre_update_vma(vma); 3017 vma->vm_end = address; 3018 /* Overwrite old entry in mtree. 
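 * The VMA is already present in the maple tree; only its end has grown,
 * so the existing entry is overwritten in place rather than a new one
 * being inserted (cf. vma_iter_store_new() when linking a brand-new VMA).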
*/ 3019 vma_iter_store_overwrite(&vmi, vma); 3020 anon_vma_interval_tree_post_update_vma(vma); 3021 3022 perf_event_mmap(vma); 3023 } 3024 } 3025 } 3026 anon_vma_unlock_write(vma->anon_vma); 3027 vma_iter_free(&vmi); 3028 validate_mm(mm); 3029 return error; 3030 } 3031 #endif /* CONFIG_STACK_GROWSUP */ 3032 3033 /* 3034 * vma is the first one with address < vma->vm_start. Have to extend vma. 3035 * mmap_lock held for writing. 3036 */ 3037 int expand_downwards(struct vm_area_struct *vma, unsigned long address) 3038 { 3039 struct mm_struct *mm = vma->vm_mm; 3040 struct vm_area_struct *prev; 3041 int error = 0; 3042 VMA_ITERATOR(vmi, mm, vma->vm_start); 3043 3044 if (!(vma->vm_flags & VM_GROWSDOWN)) 3045 return -EFAULT; 3046 3047 mmap_assert_write_locked(mm); 3048 3049 address &= PAGE_MASK; 3050 if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) 3051 return -EPERM; 3052 3053 /* Enforce stack_guard_gap */ 3054 prev = vma_prev(&vmi); 3055 /* Check that both stack segments have the same anon_vma? */ 3056 if (prev) { 3057 if (!(prev->vm_flags & VM_GROWSDOWN) && 3058 vma_is_accessible(prev) && 3059 (address - prev->vm_end < stack_guard_gap)) 3060 return -ENOMEM; 3061 } 3062 3063 if (prev) 3064 vma_iter_next_range_limit(&vmi, vma->vm_start); 3065 3066 vma_iter_config(&vmi, address, vma->vm_end); 3067 if (vma_iter_prealloc(&vmi, vma)) 3068 return -ENOMEM; 3069 3070 /* We must make sure the anon_vma is allocated. */ 3071 if (unlikely(anon_vma_prepare(vma))) { 3072 vma_iter_free(&vmi); 3073 return -ENOMEM; 3074 } 3075 3076 /* Lock the VMA before expanding to prevent concurrent page faults */ 3077 vma_start_write(vma); 3078 /* We update the anon VMA tree. */ 3079 anon_vma_lock_write(vma->anon_vma); 3080 3081 /* Somebody else might have raced and expanded it already */ 3082 if (address < vma->vm_start) { 3083 unsigned long size, grow; 3084 3085 size = vma->vm_end - address; 3086 grow = (vma->vm_start - address) >> PAGE_SHIFT; 3087 3088 error = -ENOMEM; 3089 if (grow <= vma->vm_pgoff) { 3090 error = acct_stack_growth(vma, size, grow); 3091 if (!error) { 3092 if (vma->vm_flags & VM_LOCKED) 3093 mm->locked_vm += grow; 3094 vm_stat_account(mm, vma->vm_flags, grow); 3095 anon_vma_interval_tree_pre_update_vma(vma); 3096 vma->vm_start = address; 3097 vma->vm_pgoff -= grow; 3098 /* Overwrite old entry in mtree. */ 3099 vma_iter_store_overwrite(&vmi, vma); 3100 anon_vma_interval_tree_post_update_vma(vma); 3101 3102 perf_event_mmap(vma); 3103 } 3104 } 3105 } 3106 anon_vma_unlock_write(vma->anon_vma); 3107 vma_iter_free(&vmi); 3108 validate_mm(mm); 3109 return error; 3110 } 3111 3112 int __vm_munmap(unsigned long start, size_t len, bool unlock) 3113 { 3114 int ret; 3115 struct mm_struct *mm = current->mm; 3116 LIST_HEAD(uf); 3117 VMA_ITERATOR(vmi, mm, start); 3118 3119 if (mmap_write_lock_killable(mm)) 3120 return -EINTR; 3121 3122 ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); 3123 if (ret || !unlock) 3124 mmap_write_unlock(mm); 3125 3126 userfaultfd_unmap_complete(mm, &uf); 3127 return ret; 3128 } 3129 3130 3131 /* Insert vm structure into process list sorted by address 3132 * and into the inode's i_mmap tree. If vm_file is non-NULL 3133 * then i_mmap_rwsem is taken here. 
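 *
 * A minimal call sequence, for illustration only (assuming the caller
 * holds mmap_lock for writing and sets up any file backing itself):
 *
 *	vma = vm_area_alloc(mm);
 *	if (!vma)
 *		return -ENOMEM;
 *	vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
 *	vm_flags_init(vma, vm_flags);
 *	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 *	if (insert_vm_struct(mm, vma)) {
 *		vm_area_free(vma);
 *		return -ENOMEM;
 *	}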
3134  */
3135 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
3136 {
3137         unsigned long charged = vma_pages(vma);
3138 
3139 
3140         if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
3141                 return -ENOMEM;
3142 
3143         if ((vma->vm_flags & VM_ACCOUNT) &&
3144             security_vm_enough_memory_mm(mm, charged))
3145                 return -ENOMEM;
3146 
3147         /*
3148          * The vm_pgoff of a purely anonymous vma should be irrelevant
3149          * until its first write fault, when the page's anon_vma and index
3150          * are set. But now set the vm_pgoff it will almost certainly
3151          * end up with (unless mremap moves it elsewhere before that
3152          * first write fault), so /proc/pid/maps tells a consistent story.
3153          *
3154          * By setting it to reflect the virtual start address of the
3155          * vma, merges and splits can happen in a seamless way, just
3156          * using the existing file pgoff checks and manipulations.
3157          * Similarly in do_mmap and in do_brk_flags.
3158          */
3159         if (vma_is_anonymous(vma)) {
3160                 BUG_ON(vma->anon_vma);
3161                 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3162         }
3163 
3164         if (vma_link(mm, vma)) {
3165                 if (vma->vm_flags & VM_ACCOUNT)
3166                         vm_unacct_memory(charged);
3167                 return -ENOMEM;
3168         }
3169 
3170         return 0;
3171 }
3172 