// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * VMA-specific functions.
 */

#include "vma_internal.h"
#include "vma.h"

struct mmap_state {
	struct mm_struct *mm;
	struct vma_iterator *vmi;

	unsigned long addr;
	unsigned long end;
	pgoff_t pgoff;
	unsigned long pglen;
	unsigned long flags;
	struct file *file;
	pgprot_t page_prot;

	/* User-defined fields, perhaps updated by .mmap_prepare(). */
	const struct vm_operations_struct *vm_ops;
	void *vm_private_data;

	unsigned long charged;

	struct vm_area_struct *prev;
	struct vm_area_struct *next;

	/* Unmapping state. */
	struct vma_munmap_struct vms;
	struct ma_state mas_detach;
	struct maple_tree mt_detach;
};

#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_)	\
	struct mmap_state name = {					\
		.mm = mm_,						\
		.vmi = vmi_,						\
		.addr = addr_,						\
		.end = (addr_) + (len_),				\
		.pgoff = pgoff_,					\
		.pglen = PHYS_PFN(len_),				\
		.flags = flags_,					\
		.file = file_,						\
		.page_prot = vm_get_page_prot(flags_),			\
	}

#define VMG_MMAP_STATE(name, map_, vma_)				\
	struct vma_merge_struct name = {				\
		.mm = (map_)->mm,					\
		.vmi = (map_)->vmi,					\
		.start = (map_)->addr,					\
		.end = (map_)->end,					\
		.flags = (map_)->flags,					\
		.pgoff = (map_)->pgoff,					\
		.file = (map_)->file,					\
		.prev = (map_)->prev,					\
		.middle = vma_,						\
		.next = (vma_) ? NULL : (map_)->next,			\
		.state = VMA_MERGE_START,				\
	}

/*
 * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
 * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
 * would mean a wider range of folios sharing the root anon_vma lock, and thus
 * potential lock contention; we do not wish to encourage merging such that
 * this scales to a problem.
 */
static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
{
	/*
	 * The list_is_singular() test is to avoid merging VMAs cloned from
	 * parents. This improves scalability by limiting anon_vma lock
	 * contention.
	 */
	return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
}

static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
	struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;

	if (!mpol_equal(vmg->policy, vma_policy(vma)))
		return false;
	/*
	 * VM_SOFTDIRTY should not prevent VMA merging if the flags match in
	 * everything but the dirty bit -- the caller should mark the merged
	 * VMA as dirty. If the dirty bit were not excluded from the
	 * comparison, we would increase pressure on the memory system by
	 * forcing the kernel to generate new VMAs where an old one could
	 * have been extended instead.
	 */
	if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
		return false;
	if (vma->vm_file != vmg->file)
		return false;
	if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
		return false;
	if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
		return false;
	return true;
}

static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
	struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
	struct vm_area_struct *src = vmg->middle; /* existing merge case. */
	struct anon_vma *tgt_anon = tgt->anon_vma;
	struct anon_vma *src_anon = vmg->anon_vma;

	/*
	 * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
	 * will remove the existing VMA's anon_vma's so there's no scalability
	 * concerns.
	 */
	VM_WARN_ON(src && src_anon != src->anon_vma);

	/* Case 1 - we will dup_anon_vma() from src into tgt. */
	if (!tgt_anon && src_anon)
		return !vma_had_uncowed_parents(src);
	/* Case 2 - we will simply use tgt's anon_vma. */
	if (tgt_anon && !src_anon)
		return !vma_had_uncowed_parents(tgt);
	/* Case 3 - the anon_vma's are already shared. */
	return src_anon == tgt_anon;
}

/*
 * init_multi_vma_prep() - Initializer for struct vma_prepare
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 * @vmg: The merge state that will be used to determine adjustment and VMA
 *       removal.
 */
static void init_multi_vma_prep(struct vma_prepare *vp,
				struct vm_area_struct *vma,
				struct vma_merge_struct *vmg)
{
	struct vm_area_struct *adjust;
	struct vm_area_struct **remove = &vp->remove;

	memset(vp, 0, sizeof(struct vma_prepare));
	vp->vma = vma;
	vp->anon_vma = vma->anon_vma;

	if (vmg && vmg->__remove_middle) {
		*remove = vmg->middle;
		remove = &vp->remove2;
	}
	if (vmg && vmg->__remove_next)
		*remove = vmg->next;

	if (vmg && vmg->__adjust_middle_start)
		adjust = vmg->middle;
	else if (vmg && vmg->__adjust_next_start)
		adjust = vmg->next;
	else
		adjust = NULL;

	vp->adj_next = adjust;
	if (!vp->anon_vma && adjust)
		vp->anon_vma = adjust->anon_vma;

	VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
		   vp->anon_vma != adjust->anon_vma);

	vp->file = vma->vm_file;
	if (vp->file)
		vp->mapping = vma->vm_file->f_mapping;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 *
 * We assume the vma may be removed as part of the merge.
 */
static bool can_vma_merge_before(struct vma_merge_struct *vmg)
{
	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);

	if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
	    is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
		if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
			return true;
	}

	return false;
}
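
/*
 * Worked example of the offset check above (illustrative values, assuming
 * 4 KiB pages): suppose the proposed range is [0x2000, 0x4000) mapping a
 * file at vmg->pgoff == 3. Then pglen == 2, so a following VMA can only be
 * merged "before" it if that VMA maps the file starting at page offset
 * 3 + 2 == 5, i.e. vmg->next->vm_pgoff == 5. Whether the two are also
 * virtually adjacent (vmg->end == next->vm_start) is checked separately,
 * in can_vma_merge_right().
 */
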
/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We assume that vma is not removed as part of the merge.
 */
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
	if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
	    is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
		if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
			return true;
	}
	return false;
}

static void __vma_link_file(struct vm_area_struct *vma,
			    struct address_space *mapping)
{
	if (vma_is_shared_maywrite(vma))
		mapping_allow_writable(mapping);

	flush_dcache_mmap_lock(mapping);
	vma_interval_tree_insert(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
				      struct address_space *mapping)
{
	if (vma_is_shared_maywrite(vma))
		mapping_unmap_writable(mapping);

	flush_dcache_mmap_lock(mapping);
	vma_interval_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

/*
 * vma_prepare() - Helper function for handling locking VMAs prior to altering
 * @vp: The initialized vma_prepare struct
 */
static void vma_prepare(struct vma_prepare *vp)
{
	if (vp->file) {
		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

		if (vp->adj_next)
			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
				      vp->adj_next->vm_end);

		i_mmap_lock_write(vp->mapping);
		if (vp->insert && vp->insert->vm_file) {
			/*
			 * Put into interval tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(vp->insert,
					vp->insert->vm_file->f_mapping);
		}
	}

	if (vp->anon_vma) {
		anon_vma_lock_write(vp->anon_vma);
		anon_vma_interval_tree_pre_update_vma(vp->vma);
		if (vp->adj_next)
			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
	}

	if (vp->file) {
		flush_dcache_mmap_lock(vp->mapping);
		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
		if (vp->adj_next)
			vma_interval_tree_remove(vp->adj_next,
						 &vp->mapping->i_mmap);
	}

}

/*
 * vma_complete() - Helper function for handling the unlocking after altering
 * VMAs, or for inserting a VMA.
 *
 * @vp: The vma_prepare struct
 * @vmi: The vma iterator
 * @mm: The mm_struct
 */
static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
			 struct mm_struct *mm)
{
	if (vp->file) {
		if (vp->adj_next)
			vma_interval_tree_insert(vp->adj_next,
						 &vp->mapping->i_mmap);
		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
		flush_dcache_mmap_unlock(vp->mapping);
	}

	if (vp->remove && vp->file) {
		__remove_shared_vm_struct(vp->remove, vp->mapping);
		if (vp->remove2)
			__remove_shared_vm_struct(vp->remove2, vp->mapping);
	} else if (vp->insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		vma_iter_store_new(vmi, vp->insert);
		mm->map_count++;
	}

	if (vp->anon_vma) {
		anon_vma_interval_tree_post_update_vma(vp->vma);
		if (vp->adj_next)
			anon_vma_interval_tree_post_update_vma(vp->adj_next);
		anon_vma_unlock_write(vp->anon_vma);
	}

	if (vp->file) {
		i_mmap_unlock_write(vp->mapping);
		uprobe_mmap(vp->vma);

		if (vp->adj_next)
			uprobe_mmap(vp->adj_next);
	}

	if (vp->remove) {
again:
		vma_mark_detached(vp->remove);
		if (vp->file) {
			uprobe_munmap(vp->remove, vp->remove->vm_start,
				      vp->remove->vm_end);
			fput(vp->file);
		}
		if (vp->remove->anon_vma)
			anon_vma_merge(vp->vma, vp->remove);
		mm->map_count--;
		mpol_put(vma_policy(vp->remove));
		if (!vp->remove2)
			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
		vm_area_free(vp->remove);

		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we are removing both mid and next vmas
		 */
		if (vp->remove2) {
			vp->remove = vp->remove2;
			vp->remove2 = NULL;
			goto again;
		}
	}
	if (vp->insert && vp->file)
		uprobe_mmap(vp->insert);
}

/*
 * init_vma_prep() - Initializer wrapper for vma_prepare struct
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 */
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
{
	init_multi_vma_prep(vp, vma, NULL);
}

/*
 * Can the proposed VMA be merged with the left (previous) VMA taking into
 * account the start position of the proposed range.
 */
static bool can_vma_merge_left(struct vma_merge_struct *vmg)
{
	return vmg->prev && vmg->prev->vm_end == vmg->start &&
		can_vma_merge_after(vmg);
}

/*
 * Can the proposed VMA be merged with the right (next) VMA taking into
 * account the end position of the proposed range.
 *
 * In addition, if we can merge with the left VMA, ensure that left and right
 * anon_vma's are also compatible.
 */
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
				bool can_merge_left)
{
	struct vm_area_struct *next = vmg->next;
	struct vm_area_struct *prev;

	if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
		return false;

	if (!can_merge_left)
		return true;

	/*
	 * If we can merge with prev (left) and next (right), indicating that
	 * each VMA's anon_vma is compatible with the proposed anon_vma, this
	 * does not mean prev and next are compatible with EACH OTHER.
	 *
	 * We therefore check this in addition to mergeability to either side.
	 */
	prev = vmg->prev;
	return !prev->anon_vma || !next->anon_vma ||
		prev->anon_vma == next->anon_vma;
}

/*
 * Close a vm structure and free it.
 */
void remove_vma(struct vm_area_struct *vma)
{
	might_sleep();
	vma_close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
	mpol_put(vma_policy(vma));
	vm_area_free(vma);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
		struct vm_area_struct *prev, struct vm_area_struct *next)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
		   /* mm_wr_locked = */ true);
	mas_set(mas, vma->vm_end);
	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
		      next ? next->vm_start : USER_PGTABLES_CEILING,
		      /* mm_wr_locked = */ true);
	tlb_finish_mmu(&tlb);
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
 * has already been checked or doesn't make sense to fail.
 * VMA Iterator will point to the original VMA.
 */
static __must_check int
__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
	    unsigned long addr, int new_below)
{
	struct vma_prepare vp;
	struct vm_area_struct *new;
	int err;

	WARN_ON(vma->vm_start >= addr);
	WARN_ON(vma->vm_end <= addr);

	if (vma->vm_ops && vma->vm_ops->may_split) {
		err = vma->vm_ops->may_split(vma, addr);
		if (err)
			return err;
	}

	new = vm_area_dup(vma);
	if (!new)
		return -ENOMEM;

	if (new_below) {
		new->vm_end = addr;
	} else {
		new->vm_start = addr;
		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
	}

	err = -ENOMEM;
	vma_iter_config(vmi, new->vm_start, new->vm_end);
	if (vma_iter_prealloc(vmi, new))
		goto out_free_vma;

	err = vma_dup_policy(vma, new);
	if (err)
		goto out_free_vmi;

	err = anon_vma_clone(new, vma);
	if (err)
		goto out_free_mpol;

	if (new->vm_file)
		get_file(new->vm_file);

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	vma_start_write(vma);
	vma_start_write(new);

	init_vma_prep(&vp, vma);
	vp.insert = new;
	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);

	if (new_below) {
		vma->vm_start = addr;
		vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
	} else {
		vma->vm_end = addr;
	}

	/* vma_complete stores the new vma */
	vma_complete(&vp, vmi, vma->vm_mm);
	validate_mm(vma->vm_mm);

	/* Success. */
	if (new_below)
		vma_next(vmi);
	else
		vma_prev(vmi);

	return 0;

out_free_mpol:
	mpol_put(vma_policy(new));
out_free_vmi:
	vma_iter_free(vmi);
out_free_vma:
	vm_area_free(new);
	return err;
}
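
/*
 * Worked example of the vm_pgoff adjustments above (illustrative values,
 * assuming 4 KiB pages): take a VMA spanning [0x10000, 0x20000) with
 * vm_pgoff == 0x100 and split it at addr == 0x14000.
 *
 * With new_below == 0, the new VMA covers [0x14000, 0x20000) and its
 * vm_pgoff becomes 0x100 + ((0x14000 - 0x10000) >> PAGE_SHIFT) == 0x104,
 * while the original VMA shrinks to [0x10000, 0x14000) keeping pgoff 0x100.
 *
 * With new_below == 1, the new VMA covers [0x10000, 0x14000) keeping pgoff
 * 0x100, and the original VMA becomes [0x14000, 0x20000) with its vm_pgoff
 * bumped by 4 pages to 0x104.
 */
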
/*
 * Split a vma into two pieces at address 'addr'; a new vma is allocated
 * either for the first part or the tail.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
		     unsigned long addr, int new_below)
{
	if (vma->vm_mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	return __split_vma(vmi, vma, addr, new_below);
}

/*
 * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
 * instance that the destination VMA has no anon_vma but the source does.
 *
 * @dst: The destination VMA
 * @src: The source VMA
 * @dup: Pointer to the destination VMA when successful.
 *
 * Returns: 0 on success.
 */
static int dup_anon_vma(struct vm_area_struct *dst,
			struct vm_area_struct *src, struct vm_area_struct **dup)
{
	/*
	 * There are three cases to consider for correctly propagating
	 * anon_vma's on merge.
	 *
	 * The first is trivial - neither VMA has anon_vma, we need not do
	 * anything.
	 *
	 * The second where both have anon_vma is also a no-op, as they must
	 * then be the same, so there is simply nothing to copy.
	 *
	 * Here we cover the third - if the destination VMA has no anon_vma,
	 * that is, it is unfaulted, we need to ensure that the newly merged
	 * range is referenced by the anon_vma's of the source.
	 */
	if (src->anon_vma && !dst->anon_vma) {
		int ret;

		vma_assert_write_locked(dst);
		dst->anon_vma = src->anon_vma;
		ret = anon_vma_clone(dst, src);
		if (ret)
			return ret;

		*dup = dst;
	}

	return 0;
}

#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mt_validate(&mm->mm_mt);
	for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
		struct anon_vma *anon_vma = vma->anon_vma;
		struct anon_vma_chain *avc;
#endif
		unsigned long vmi_start, vmi_end;
		bool warn = 0;

		vmi_start = vma_iter_addr(&vmi);
		vmi_end = vma_iter_end(&vmi);
		if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
			warn = 1;

		if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
			warn = 1;

		if (warn) {
			pr_emerg("issue in %s\n", current->comm);
			dump_stack();
			dump_vma(vma);
			pr_emerg("tree range: %px start %lx end %lx\n", vma,
				 vmi_start, vmi_end - 1);
			vma_iter_dump_tree(&vmi);
		}

#ifdef CONFIG_DEBUG_VM_RB
		if (anon_vma) {
			anon_vma_lock_read(anon_vma);
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				anon_vma_interval_tree_verify(avc);
			anon_vma_unlock_read(anon_vma);
		}
#endif
		/* Check for an infinite loop. */
		if (++i > mm->map_count + 10) {
			i = -1;
			break;
		}
	}
	if (i != mm->map_count) {
		pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
		bug = 1;
	}
	VM_BUG_ON_MM(bug, mm);
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
 * Based on the vmg flag indicating whether we need to adjust the vm_start field
 * for the middle or next VMA, we calculate what the range of the newly adjusted
 * VMA ought to be, and set the VMA's range accordingly.
 */
static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *adjust;
	pgoff_t pgoff;

	if (vmg->__adjust_middle_start) {
		adjust = vmg->middle;
		pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
	} else if (vmg->__adjust_next_start) {
		adjust = vmg->next;
		pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
	} else {
		return;
	}

	vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
}

/*
 * Actually perform the VMA merge operation.
 *
 * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
 * modify any VMAs or cause inconsistent state should an OOM condition arise.
 *
 * Returns 0 on success, or an error value on failure.
 */
static int commit_merge(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *vma;
	struct vma_prepare vp;

	if (vmg->__adjust_next_start) {
		/* We manipulate middle and adjust next, which is the target. */
		vma = vmg->middle;
		vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
	} else {
		vma = vmg->target;
		/* Note: vma iterator must be pointing to 'start'. */
		vma_iter_config(vmg->vmi, vmg->start, vmg->end);
	}

	init_multi_vma_prep(&vp, vma, vmg);

	/*
	 * If vmg->give_up_on_oom is set, we're safe, because we don't actually
	 * manipulate any VMAs until we succeed at preallocation.
	 *
	 * Past this point, we will not return an error.
	 */
	if (vma_iter_prealloc(vmg->vmi, vma))
		return -ENOMEM;

	vma_prepare(&vp);
	/*
	 * THP pages may need to do additional splits if we increase
	 * middle->vm_start.
	 */
	vma_adjust_trans_huge(vma, vmg->start, vmg->end,
			      vmg->__adjust_middle_start ? vmg->middle : NULL);
	vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
	vmg_adjust_set_range(vmg);
	vma_iter_store_overwrite(vmg->vmi, vmg->target);

	vma_complete(&vp, vmg->vmi, vma->vm_mm);

	return 0;
}

/* We can only remove VMAs when merging if they do not have a close hook. */
static bool can_merge_remove_vma(struct vm_area_struct *vma)
{
	return !vma->vm_ops || !vma->vm_ops->close;
}

/*
 * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
 * attributes modified.
 *
 * @vmg: Describes the modifications being made to a VMA and associated
 *       metadata.
 *
 * When the attributes of a range within a VMA change, it might be possible for
 * immediately adjacent VMAs to be merged into that VMA due to having identical
 * properties.
 *
 * This function checks for the existence of any such mergeable VMAs and updates
 * the maple tree describing the @vmg->middle->vm_mm address space to account
 * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
 * merge.
 *
 * As part of this operation, if a merge occurs, the @vmg object will have its
 * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
 * calls to this function should reset these fields.
 *
 * Returns: The merged VMA if merge succeeds, or NULL otherwise.
 *
 * ASSUMPTIONS:
 * - The caller must assign the VMA to be modified to @vmg->middle.
 * - The caller must have set @vmg->prev to the previous VMA, if there is one.
 * - The caller must not set @vmg->next, as we determine this.
 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
 */
static __must_check struct vm_area_struct *vma_merge_existing_range(
		struct vma_merge_struct *vmg)
{
	struct vm_area_struct *middle = vmg->middle;
	struct vm_area_struct *prev = vmg->prev;
	struct vm_area_struct *next;
	struct vm_area_struct *anon_dup = NULL;
	unsigned long start = vmg->start;
	unsigned long end = vmg->end;
	bool left_side = middle && start == middle->vm_start;
	bool right_side = middle && end == middle->vm_end;
	int err = 0;
	bool merge_left, merge_right, merge_both;

	mmap_assert_write_locked(vmg->mm);
	VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
	VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
	VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
	VM_WARN_ON_VMG(start >= end, vmg);

	/*
	 * If middle == prev, then we are offset into a VMA. Otherwise, if we
	 * are not, we must span a portion of the VMA.
	 */
	VM_WARN_ON_VMG(middle &&
		       ((middle != prev && vmg->start != middle->vm_start) ||
			vmg->end > middle->vm_end), vmg);
	/* The vmi must be positioned within vmg->middle. */
	VM_WARN_ON_VMG(middle &&
		       !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
			 vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);

	vmg->state = VMA_MERGE_NOMERGE;

	/*
	 * If this is a special mapping, or if the range being modified is
	 * neither at the furthermost left nor right side of the VMA, then we
	 * have no chance of merging and should abort.
	 */
	if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
		return NULL;

	if (left_side)
		merge_left = can_vma_merge_left(vmg);
	else
		merge_left = false;

	if (right_side) {
		next = vmg->next = vma_iter_next_range(vmg->vmi);
		vma_iter_prev_range(vmg->vmi);

		merge_right = can_vma_merge_right(vmg, merge_left);
	} else {
		merge_right = false;
		next = NULL;
	}

	if (merge_left)		/* If merging prev, position iterator there. */
		vma_prev(vmg->vmi);
	else if (!merge_right)	/* If we have nothing to merge, abort. */
		return NULL;

	merge_both = merge_left && merge_right;
	/* If we span the entire VMA, a merge implies it will be deleted. */
	vmg->__remove_middle = left_side && right_side;

	/*
	 * If we need to remove middle in its entirety but are unable to do so,
	 * we have no sensible recourse but to abort the merge.
	 */
	if (vmg->__remove_middle && !can_merge_remove_vma(middle))
		return NULL;

	/*
	 * If we merge both VMAs, then next is also deleted. This implies that
	 * middle is removed (__remove_middle) as well.
	 */
	vmg->__remove_next = merge_both;

	/*
	 * If we cannot delete next, then we can reduce the operation to merging
	 * prev and middle (thereby deleting middle).
	 */
	if (vmg->__remove_next && !can_merge_remove_vma(next)) {
		vmg->__remove_next = false;
		merge_right = false;
		merge_both = false;
	}

	/* No matter what happens, we will be adjusting middle. */
	vma_start_write(middle);

	if (merge_right) {
		vma_start_write(next);
		vmg->target = next;
	}

	if (merge_left) {
		vma_start_write(prev);
		vmg->target = prev;
	}

	if (merge_both) {
		/*
		 * |<-------------------->|
		 * |-------********-------|
		 *   prev   middle   next
		 *  extend  delete  delete
		 */

		vmg->start = prev->vm_start;
		vmg->end = next->vm_end;
		vmg->pgoff = prev->vm_pgoff;

		/*
		 * We already ensured anon_vma compatibility above, so now it's
		 * simply a case of, if prev has no anon_vma object, which of
		 * next or middle contains the anon_vma we must duplicate.
		 */
		err = dup_anon_vma(prev, next->anon_vma ? next : middle,
				   &anon_dup);
	} else if (merge_left) {
		/*
		 * |<------------>|          OR
		 * |<----------------->|
		 * |-------*************
		 *   prev     middle
		 *  extend shrink/delete
		 */

		vmg->start = prev->vm_start;
		vmg->pgoff = prev->vm_pgoff;

		if (!vmg->__remove_middle)
			vmg->__adjust_middle_start = true;

		err = dup_anon_vma(prev, middle, &anon_dup);
	} else { /* merge_right */
		/*
		 *     |<------------->| OR
		 * |<----------------->|
		 * *************-------|
		 *    middle     next
		 * shrink/delete extend
		 */

		pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);

		VM_WARN_ON_VMG(!merge_right, vmg);
		/* If we are offset into a VMA, then prev must be middle. */
		VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);

		if (vmg->__remove_middle) {
			vmg->end = next->vm_end;
			vmg->pgoff = next->vm_pgoff - pglen;
		} else {
			/* We shrink middle and expand next. */
			vmg->__adjust_next_start = true;
			vmg->start = middle->vm_start;
			vmg->end = start;
			vmg->pgoff = middle->vm_pgoff;
		}

		err = dup_anon_vma(next, middle, &anon_dup);
	}

	if (err)
		goto abort;

	err = commit_merge(vmg);
	if (err) {
		VM_WARN_ON(err != -ENOMEM);

		if (anon_dup)
			unlink_anon_vmas(anon_dup);

		/*
		 * We've cleaned up any cloned anon_vma's, no VMAs have been
		 * modified, no harm no foul if the user requests that we not
		 * report this and just give up, leaving the VMAs unmerged.
		 */
		if (!vmg->give_up_on_oom)
			vmg->state = VMA_MERGE_ERROR_NOMEM;
		return NULL;
	}

	khugepaged_enter_vma(vmg->target, vmg->flags);
	vmg->state = VMA_MERGE_SUCCESS;
	return vmg->target;

abort:
	vma_iter_set(vmg->vmi, start);
	vma_iter_load(vmg->vmi);

	/*
	 * This means we have failed to clone anon_vma's correctly, but no
	 * actual changes to VMAs have occurred, so no harm no foul - if the
	 * user doesn't want this reported and instead just wants to give up on
	 * the merge, allow it.
	 */
	if (!vmg->give_up_on_oom)
		vmg->state = VMA_MERGE_ERROR_NOMEM;
	return NULL;
}
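
/*
 * Worked example of the merge_both case above (illustrative values, assuming
 * 4 KiB pages): prev maps [0x1000, 0x3000) at pgoff 0, middle maps
 * [0x3000, 0x5000) at pgoff 2 and next maps [0x5000, 0x7000) at pgoff 4 of
 * the same file. Modifying the whole of middle so that all three become
 * compatible yields a single VMA spanning [0x1000, 0x7000) with
 * vm_pgoff == prev->vm_pgoff == 0, while middle and next are both deleted.
 */
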
/*
 * vma_merge_new_range - Attempt to merge a new VMA into address space
 *
 * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
 *       (exclusive), which we try to merge with any adjacent VMAs if possible.
 *
 * We are about to add a VMA to the address space starting at @vmg->start and
 * ending at @vmg->end. There are three different possible scenarios:
 *
 * 1. There is a VMA with identical properties immediately adjacent to the
 *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
 *    EXPAND that VMA:
 *
 * Proposed:       |-----|  or  |-----|
 * Existing:  |----|                  |----|
 *
 * 2. There are VMAs with identical properties immediately adjacent to the
 *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
 *    EXPAND the former and REMOVE the latter:
 *
 * Proposed:       |-----|
 * Existing:  |----|     |----|
 *
 * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
 *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
 *
 * In instances where we can merge, this function returns the expanded VMA which
 * will have its range adjusted accordingly and the underlying maple tree also
 * adjusted.
 *
 * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
 *          to the VMA we expanded.
 *
 * This function adjusts @vmg to provide @vmg->next if not already specified,
 * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
 *
 * ASSUMPTIONS:
 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
 *   other than VMAs that will be unmapped should the operation succeed.
 * - The caller must have specified the previous vma in @vmg->prev.
 * - The caller must have specified the next vma in @vmg->next.
 * - The caller must have positioned the vmi at or before the gap.
 */
struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *prev = vmg->prev;
	struct vm_area_struct *next = vmg->next;
	unsigned long end = vmg->end;
	bool can_merge_left, can_merge_right;

	mmap_assert_write_locked(vmg->mm);
	VM_WARN_ON_VMG(vmg->middle, vmg);
	/* vmi must point at or before the gap. */
	VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);

	vmg->state = VMA_MERGE_NOMERGE;

	/* Special VMAs are unmergeable, also if no prev/next. */
	if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
		return NULL;

	can_merge_left = can_vma_merge_left(vmg);
	can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);

	/* If we can merge with the next VMA, adjust vmg accordingly. */
	if (can_merge_right) {
		vmg->end = next->vm_end;
		vmg->middle = next;
	}

	/* If we can merge with the previous VMA, adjust vmg accordingly. */
	if (can_merge_left) {
		vmg->start = prev->vm_start;
		vmg->middle = prev;
		vmg->pgoff = prev->vm_pgoff;

		/*
		 * If this merge would result in removal of the next VMA but we
		 * are not permitted to do so, reduce the operation to merging
		 * prev and vma.
		 */
		if (can_merge_right && !can_merge_remove_vma(next))
			vmg->end = end;

		/* In expand-only case we are already positioned at prev. */
		if (!vmg->just_expand) {
			/* Equivalent to going to the previous range. */
			vma_prev(vmg->vmi);
		}
	}

	/*
	 * Now try to expand adjacent VMA(s). This takes care of removing the
	 * following VMA if we have VMAs on both sides.
	 */
	if (vmg->middle && !vma_expand(vmg)) {
		khugepaged_enter_vma(vmg->middle, vmg->flags);
		vmg->state = VMA_MERGE_SUCCESS;
		return vmg->middle;
	}

	return NULL;
}

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmg: Describes a VMA expansion operation.
 *
 * Expand @vmg->middle to vmg->start and vmg->end. Can expand off the start and
 * end. Will expand over vmg->next if it's different from vmg->middle and
 * vmg->end == vmg->next->vm_end. Checking if the vmg->middle can expand and
 * merge with vmg->next needs to be handled by the caller.
 *
 * Returns: 0 on success.
 *
 * ASSUMPTIONS:
 * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
 * - The caller must have set @vmg->middle and @vmg->next.
 */
int vma_expand(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *anon_dup = NULL;
	bool remove_next = false;
	struct vm_area_struct *middle = vmg->middle;
	struct vm_area_struct *next = vmg->next;

	mmap_assert_write_locked(vmg->mm);

	vma_start_write(middle);
	if (next && (middle != next) && (vmg->end == next->vm_end)) {
		int ret;

		remove_next = true;
		/* This should already have been checked by this point. */
		VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
		vma_start_write(next);
		/*
		 * In this case we don't report OOM, so vmg->give_up_on_oom is
		 * safe.
		 */
		ret = dup_anon_vma(middle, next, &anon_dup);
		if (ret)
			return ret;
	}

	/* Not merging but overwriting any part of next is not handled. */
	VM_WARN_ON_VMG(next && !remove_next &&
		       next != middle && vmg->end > next->vm_start, vmg);
	/* Only handles expanding */
	VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
		       middle->vm_end > vmg->end, vmg);

	vmg->target = middle;
	if (remove_next)
		vmg->__remove_next = true;

	if (commit_merge(vmg))
		goto nomem;

	return 0;

nomem:
	if (anon_dup)
		unlink_anon_vmas(anon_dup);
	/*
	 * If the user requests that we just give up on OOM, we are safe to do
	 * so here, as commit_merge() provides this contract to us. Nothing has
	 * been changed - no harm no foul, just don't report it.
	 */
	if (!vmg->give_up_on_oom)
		vmg->state = VMA_MERGE_ERROR_NOMEM;
	return -ENOMEM;
}

/*
 * vma_shrink() - Reduce an existing VMA's memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The new page offset
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff)
{
	struct vma_prepare vp;

	WARN_ON((vma->vm_start != start) && (vma->vm_end != end));

	if (vma->vm_start < start)
		vma_iter_config(vmi, vma->vm_start, start);
	else
		vma_iter_config(vmi, end, vma->vm_end);

	if (vma_iter_prealloc(vmi, NULL))
		return -ENOMEM;

	vma_start_write(vma);

	init_vma_prep(&vp, vma);
	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, start, end, NULL);

	vma_iter_clear(vmi);
	vma_set_range(vma, start, end, pgoff);
	vma_complete(&vp, vmi, vma->vm_mm);
	validate_mm(vma->vm_mm);
	return 0;
}

static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach, bool mm_wr_locked)
{
	struct mmu_gather tlb;

	if (!vms->clear_ptes) /* Nothing to do */
		return;

	/*
	 * We can free page tables without write-locking mmap_lock because VMAs
	 * were isolated before we downgraded mmap_lock.
	 */
	mas_set(mas_detach, 1);
	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
	update_hiwater_rss(vms->vma->vm_mm);
	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
		   vms->vma_count, mm_wr_locked);

	mas_set(mas_detach, 1);
	/* start and end may be different if there is no prev or next vma. */
	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
		      vms->unmap_end, mm_wr_locked);
	tlb_finish_mmu(&tlb);
	vms->clear_ptes = false;
}

static void vms_clean_up_area(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
{
	struct vm_area_struct *vma;

	if (!vms->nr_pages)
		return;

	vms_clear_ptes(vms, mas_detach, true);
	mas_set(mas_detach, 0);
	mas_for_each(mas_detach, vma, ULONG_MAX)
		vma_close(vma);
}

/*
 * vms_complete_munmap_vmas() - Finish the munmap() operation
 * @vms: The vma munmap struct
 * @mas_detach: The maple state of the detached vmas
 *
 * This updates the mm_struct, unmaps the region, frees the resources used for
 * the munmap() and may downgrade the lock - if requested. Everything needed to
 * be done once the vma maple tree is updated.
 */
static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;

	mm = current->mm;
	mm->map_count -= vms->vma_count;
	mm->locked_vm -= vms->locked_vm;
	if (vms->unlock)
		mmap_write_downgrade(mm);

	if (!vms->nr_pages)
		return;

	vms_clear_ptes(vms, mas_detach, !vms->unlock);
	/* Update high watermark before we lower total_vm */
	update_hiwater_vm(mm);
	/* Stat accounting */
	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
	/* Paranoid bookkeeping */
	VM_WARN_ON(vms->exec_vm > mm->exec_vm);
	VM_WARN_ON(vms->stack_vm > mm->stack_vm);
	VM_WARN_ON(vms->data_vm > mm->data_vm);
	mm->exec_vm -= vms->exec_vm;
	mm->stack_vm -= vms->stack_vm;
	mm->data_vm -= vms->data_vm;

	/* Remove and clean up vmas */
	mas_set(mas_detach, 0);
	mas_for_each(mas_detach, vma, ULONG_MAX)
		remove_vma(vma);

	vm_unacct_memory(vms->nr_accounted);
	validate_mm(mm);
	if (vms->unlock)
		mmap_read_unlock(mm);

	__mt_destroy(mas_detach->tree);
}

/*
 * reattach_vmas() - Undo any munmap work and free resources
 * @mas_detach: The maple state with the detached maple tree
 *
 * Reattach any detached vmas and free up the maple tree used to track the vmas.
 */
static void reattach_vmas(struct ma_state *mas_detach)
{
	struct vm_area_struct *vma;

	mas_set(mas_detach, 0);
	mas_for_each(mas_detach, vma, ULONG_MAX)
		vma_mark_attached(vma);

	__mt_destroy(mas_detach->tree);
}

/*
 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
 * for removal at a later date. Handles splitting first and last if necessary
 * and marking the vmas as isolated.
 *
 * @vms: The vma munmap struct
 * @mas_detach: The maple state tracking the detached tree
 *
 * Return: 0 on success, error otherwise
 */
static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
{
	struct vm_area_struct *next = NULL;
	int error;

	/*
	 * If we need to split any vma, do it now to save pain later.
	 * Does it split the first one?
	 */
	if (vms->start > vms->vma->vm_start) {

		/*
		 * Make sure that map_count on return from munmap() will
		 * not exceed its limit; but let map_count go just above
		 * its limit temporarily, to help free resources as expected.
		 */
		if (vms->end < vms->vma->vm_end &&
		    vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
			error = -ENOMEM;
			goto map_count_exceeded;
		}

		/* Don't bother splitting the VMA if we can't unmap it anyway */
		if (!can_modify_vma(vms->vma)) {
			error = -EPERM;
			goto start_split_failed;
		}

		error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
		if (error)
			goto start_split_failed;
	}
	vms->prev = vma_prev(vms->vmi);
	if (vms->prev)
		vms->unmap_start = vms->prev->vm_end;

	/*
	 * Detach a range of VMAs from the mm. Using next as a temp variable as
	 * it is always overwritten.
	 */
	for_each_vma_range(*(vms->vmi), next, vms->end) {
		long nrpages;

		if (!can_modify_vma(next)) {
			error = -EPERM;
			goto modify_vma_failed;
		}
		/* Does it split the end? */
		if (next->vm_end > vms->end) {
			error = __split_vma(vms->vmi, next, vms->end, 0);
			if (error)
				goto end_split_failed;
		}
		vma_start_write(next);
		mas_set(mas_detach, vms->vma_count++);
		error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
		if (error)
			goto munmap_gather_failed;

		vma_mark_detached(next);
		nrpages = vma_pages(next);

		vms->nr_pages += nrpages;
		if (next->vm_flags & VM_LOCKED)
			vms->locked_vm += nrpages;

		if (next->vm_flags & VM_ACCOUNT)
			vms->nr_accounted += nrpages;

		if (is_exec_mapping(next->vm_flags))
			vms->exec_vm += nrpages;
		else if (is_stack_mapping(next->vm_flags))
			vms->stack_vm += nrpages;
		else if (is_data_mapping(next->vm_flags))
			vms->data_vm += nrpages;

		if (vms->uf) {
			/*
			 * If userfaultfd_unmap_prep returns an error the vmas
			 * will remain split, but userland will get a
			 * highly unexpected error anyway. This is no
			 * different than the case where the first of the two
			 * __split_vma fails, but we don't undo the first
			 * split, though we could. This failure is unlikely
			 * enough that it's not worth optimizing for.
			 */
			error = userfaultfd_unmap_prep(next, vms->start,
						       vms->end, vms->uf);
			if (error)
				goto userfaultfd_error;
		}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
		BUG_ON(next->vm_start < vms->start);
		BUG_ON(next->vm_start > vms->end);
#endif
	}

	vms->next = vma_next(vms->vmi);
	if (vms->next)
		vms->unmap_end = vms->next->vm_start;

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
	/* Make sure no VMAs are about to be lost. */
	{
		MA_STATE(test, mas_detach->tree, 0, 0);
		struct vm_area_struct *vma_mas, *vma_test;
		int test_count = 0;

		vma_iter_set(vms->vmi, vms->start);
		rcu_read_lock();
		vma_test = mas_find(&test, vms->vma_count - 1);
		for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
			BUG_ON(vma_mas != vma_test);
			test_count++;
			vma_test = mas_next(&test, vms->vma_count - 1);
		}
		rcu_read_unlock();
		BUG_ON(vms->vma_count != test_count);
	}
#endif

	while (vma_iter_addr(vms->vmi) > vms->start)
		vma_iter_prev_range(vms->vmi);

	vms->clear_ptes = true;
	return 0;

userfaultfd_error:
munmap_gather_failed:
end_split_failed:
modify_vma_failed:
	reattach_vmas(mas_detach);
start_split_failed:
map_count_exceeded:
	return error;
}

/*
 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
 * @vms: The vma munmap struct
 * @vmi: The vma iterator
 * @vma: The first vm_area_struct to munmap
 * @start: The aligned start address to munmap
 * @end: The aligned end address to munmap
 * @uf: The userfaultfd list_head
 * @unlock: Unlock after the operation.
 *          Only unlocked on success.
 */
static void init_vma_munmap(struct vma_munmap_struct *vms,
		struct vma_iterator *vmi, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, struct list_head *uf,
		bool unlock)
{
	vms->vmi = vmi;
	vms->vma = vma;
	if (vma) {
		vms->start = start;
		vms->end = end;
	} else {
		vms->start = vms->end = 0;
	}
	vms->unlock = unlock;
	vms->uf = uf;
	vms->vma_count = 0;
	vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
	vms->unmap_start = FIRST_USER_ADDRESS;
	vms->unmap_end = USER_PGTABLES_CEILING;
	vms->clear_ptes = false;
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
 *          success.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 *         lock held otherwise.
 */
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct mm_struct *mm, unsigned long start, unsigned long end,
		struct list_head *uf, bool unlock)
{
	struct maple_tree mt_detach;
	MA_STATE(mas_detach, &mt_detach, 0, 0);
	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
	mt_on_stack(mt_detach);
	struct vma_munmap_struct vms;
	int error;

	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
	error = vms_gather_munmap_vmas(&vms, &mas_detach);
	if (error)
		goto gather_failed;

	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
	if (error)
		goto clear_tree_failed;

	/* Point of no return */
	vms_complete_munmap_vmas(&vms, &mas_detach);
	return 0;

clear_tree_failed:
	reattach_vmas(&mas_detach);
gather_failed:
	validate_mm(mm);
	return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * This function takes @vmi, either pointing to the previous VMA or set to
 * MA_START, and sets it up to remove the mapping(s). The @len will be aligned.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 *         lock held otherwise.
 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
		  unsigned long start, size_t len, struct list_head *uf,
		  bool unlock)
{
	unsigned long end;
	struct vm_area_struct *vma;

	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	end = start + PAGE_ALIGN(len);
	if (end == start)
		return -EINVAL;

	/* Find the first overlapping VMA */
	vma = vma_find(vmi, end);
	if (!vma) {
		if (unlock)
			mmap_write_unlock(mm);
		return 0;
	}

	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}

/*
 * We are about to modify one or more of a VMA's flags, policy, userfaultfd
 * context and anonymous VMA name within the range [start, end).
 *
 * As a result, we might be able to merge the newly modified VMA range with an
 * adjacent VMA with identical properties.
 *
 * If no merge is possible and the range does not span the entirety of the VMA,
 * we then need to split the VMA to accommodate the change.
 *
 * The function returns either the merged VMA, the original VMA if a split was
 * required instead, or an error if the split failed.
 */
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *vma = vmg->middle;
	unsigned long start = vmg->start;
	unsigned long end = vmg->end;
	struct vm_area_struct *merged;

	/* First, try to merge. */
	merged = vma_merge_existing_range(vmg);
	if (merged)
		return merged;
	if (vmg_nomem(vmg))
		return ERR_PTR(-ENOMEM);

	/*
	 * Split can fail for reasons other than OOM, so if the user requests
	 * this it's probably a mistake.
	 */
	VM_WARN_ON(vmg->give_up_on_oom &&
		   (vma->vm_start != start || vma->vm_end != end));

	/* Split any preceding portion of the VMA. */
	if (vma->vm_start < start) {
		int err = split_vma(vmg->vmi, vma, start, 1);

		if (err)
			return ERR_PTR(err);
	}

	/* Split any trailing portion of the VMA. */
	if (vma->vm_end > end) {
		int err = split_vma(vmg->vmi, vma, end, 0);

		if (err)
			return ERR_PTR(err);
	}

	return vma;
}
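
/*
 * Typical caller pattern (illustrative sketch): an mprotect-style update of
 * [start, end) within @vma is expected to go through one of the
 * vma_modify_*() wrappers below, which merge or split as required, e.g.:
 *
 *	vma = vma_modify_flags(&vmi, prev, vma, start, end, newflags);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *
 * On success the returned VMA contains [start, end) with any required
 * splits/merges already applied, and the caller may then update its flags.
 */
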
struct vm_area_struct *vma_modify_flags(
	struct vma_iterator *vmi, struct vm_area_struct *prev,
	struct vm_area_struct *vma, unsigned long start, unsigned long end,
	unsigned long new_flags)
{
	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

	vmg.flags = new_flags;

	return vma_modify(&vmg);
}

struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
		       struct vm_area_struct *prev,
		       struct vm_area_struct *vma,
		       unsigned long start,
		       unsigned long end,
		       unsigned long new_flags,
		       struct anon_vma_name *new_name)
{
	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

	vmg.flags = new_flags;
	vmg.anon_name = new_name;

	return vma_modify(&vmg);
}

struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
		   struct vm_area_struct *prev,
		   struct vm_area_struct *vma,
		   unsigned long start, unsigned long end,
		   struct mempolicy *new_pol)
{
	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

	vmg.policy = new_pol;

	return vma_modify(&vmg);
}

struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
		       struct vm_area_struct *prev,
		       struct vm_area_struct *vma,
		       unsigned long start, unsigned long end,
		       unsigned long new_flags,
		       struct vm_userfaultfd_ctx new_ctx,
		       bool give_up_on_oom)
{
	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

	vmg.flags = new_flags;
	vmg.uffd_ctx = new_ctx;
	if (give_up_on_oom)
		vmg.give_up_on_oom = true;

	return vma_modify(&vmg);
}

/*
 * Expand vma by delta bytes, potentially merging with an immediately adjacent
 * VMA with identical properties.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
					struct vm_area_struct *vma,
					unsigned long delta)
{
	VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);

	vmg.next = vma_iter_next_rewind(vmi, NULL);
	vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */

	return vma_merge_new_range(&vmg);
}

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
{
	vb->count = 0;
}

static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
{
	struct address_space *mapping;
	int i;

	mapping = vb->vmas[0]->vm_file->f_mapping;
	i_mmap_lock_write(mapping);
	for (i = 0; i < vb->count; i++) {
		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
		__remove_shared_vm_struct(vb->vmas[i], mapping);
	}
	i_mmap_unlock_write(mapping);

	unlink_file_vma_batch_init(vb);
}

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
			       struct vm_area_struct *vma)
{
	if (vma->vm_file == NULL)
		return;

	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
	    vb->count == ARRAY_SIZE(vb->vmas))
		unlink_file_vma_batch_process(vb);

	vb->vmas[vb->count] = vma;
	vb->count++;
}

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
{
	if (vb->count > 0)
		unlink_file_vma_batch_process(vb);
}

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;

	if (file) {
		struct address_space *mapping = file->f_mapping;

		i_mmap_lock_write(mapping);
		__remove_shared_vm_struct(vma, mapping);
		i_mmap_unlock_write(mapping);
	}
}

void vma_link_file(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping;

	if (file) {
		mapping = file->f_mapping;
		i_mmap_lock_write(mapping);
		__vma_link_file(vma, mapping);
		i_mmap_unlock_write(mapping);
	}
}

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
	VMA_ITERATOR(vmi, mm, 0);

	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
	if (vma_iter_prealloc(&vmi, vma))
		return -ENOMEM;

	vma_start_write(vma);
	vma_iter_store_new(&vmi, vma);
	vma_link_file(vma);
	mm->map_count++;
	validate_mm(mm);
	return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff,
	bool *need_rmap_locks)
{
	struct vm_area_struct *vma = *vmap;
	unsigned long vma_start = vma->vm_start;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	bool faulted_in_anon_vma = true;
	VMA_ITERATOR(vmi, mm, addr);
	VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);

	/*
	 * If anonymous vma has not yet been faulted, update new pgoff
	 * to match new location, to increase its chance of merging.
	 */
	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
		pgoff = addr >> PAGE_SHIFT;
		faulted_in_anon_vma = false;
	}

	new_vma = find_vma_prev(mm, addr, &vmg.prev);
	if (new_vma && new_vma->vm_start < addr + len)
		return NULL;	/* should never get here */

	vmg.middle = NULL; /* New VMA range. */
	vmg.pgoff = pgoff;
	vmg.next = vma_iter_next_rewind(&vmi, NULL);
	new_vma = vma_merge_new_range(&vmg);

	if (new_vma) {
		/*
		 * Source vma may have been merged into new_vma
		 */
		if (unlikely(vma_start >= new_vma->vm_start &&
			     vma_start < new_vma->vm_end)) {
			/*
			 * The only way we can get a vma_merge with
			 * self during an mremap is if the vma hasn't
			 * been faulted in yet and we were allowed to
			 * reset the dst vma->vm_pgoff to the
			 * destination address of the mremap to allow
			 * the merge to happen. mremap must change the
			 * vm_pgoff linearity between src and dst vmas
			 * (in turn preventing a vma_merge) to be
			 * safe. It is only safe to keep the vm_pgoff
			 * linear if there are no pages mapped yet.
			 */
			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
			*vmap = vma = new_vma;
		}
		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
	} else {
		new_vma = vm_area_dup(vma);
		if (!new_vma)
			goto out;
		vma_set_range(new_vma, addr, addr + len, pgoff);
		if (vma_dup_policy(vma, new_vma))
			goto out_free_vma;
		if (anon_vma_clone(new_vma, vma))
			goto out_free_mempol;
		if (new_vma->vm_file)
			get_file(new_vma->vm_file);
		if (new_vma->vm_ops && new_vma->vm_ops->open)
			new_vma->vm_ops->open(new_vma);
		if (vma_link(mm, new_vma))
			goto out_vma_link;
		*need_rmap_locks = false;
	}
	return new_vma;

out_vma_link:
	fixup_hugetlb_reservations(new_vma);
	vma_close(new_vma);

	if (new_vma->vm_file)
		fput(new_vma->vm_file);

	unlink_anon_vmas(new_vma);
out_free_mempol:
	mpol_put(vma_policy(new_vma));
out_free_vma:
	vm_area_free(new_vma);
out:
	return NULL;
}

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
	return a->vm_end == b->vm_start &&
	       mpol_equal(vma_policy(a), vma_policy(b)) &&
	       a->vm_file == b->vm_file &&
	       !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
	       b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1926 * 1927 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1928 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1929 * is to return an anon_vma that is "complex" due to having gone through 1930 * a fork). 1931 * 1932 * We also make sure that the two vma's are compatible (adjacent, 1933 * and with the same memory policies). That's all stable, even with just 1934 * a read lock on the mmap_lock. 1935 */ 1936 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, 1937 struct vm_area_struct *a, 1938 struct vm_area_struct *b) 1939 { 1940 if (anon_vma_compatible(a, b)) { 1941 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1942 1943 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1944 return anon_vma; 1945 } 1946 return NULL; 1947 } 1948 1949 /* 1950 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1951 * neighbouring vmas for a suitable anon_vma, before it goes off 1952 * to allocate a new anon_vma. It checks because a repetitive 1953 * sequence of mprotects and faults may otherwise lead to distinct 1954 * anon_vmas being allocated, preventing vma merge in subsequent 1955 * mprotect. 1956 */ 1957 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1958 { 1959 struct anon_vma *anon_vma = NULL; 1960 struct vm_area_struct *prev, *next; 1961 VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1962 1963 /* Try next first. */ 1964 next = vma_iter_load(&vmi); 1965 if (next) { 1966 anon_vma = reusable_anon_vma(next, vma, next); 1967 if (anon_vma) 1968 return anon_vma; 1969 } 1970 1971 prev = vma_prev(&vmi); 1972 VM_BUG_ON_VMA(prev != vma, vma); 1973 prev = vma_prev(&vmi); 1974 /* Try prev next. */ 1975 if (prev) 1976 anon_vma = reusable_anon_vma(prev, prev, vma); 1977 1978 /* 1979 * We might reach here with anon_vma == NULL if we can't find 1980 * any reusable anon_vma. 1981 * There's no absolute need to look only at touching neighbours: 1982 * we could search further afield for "compatible" anon_vmas. 1983 * But it would probably just be a waste of time searching, 1984 * or lead to too many vmas hanging off the same anon_vma. 1985 * We're trying to allow mprotect remerging later on, 1986 * not trying to minimize memory used for anon_vmas. 1987 */ 1988 return anon_vma; 1989 } 1990 1991 static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 1992 { 1993 return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 1994 } 1995 1996 static bool vma_is_shared_writable(struct vm_area_struct *vma) 1997 { 1998 return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 1999 (VM_WRITE | VM_SHARED); 2000 } 2001 2002 static bool vma_fs_can_writeback(struct vm_area_struct *vma) 2003 { 2004 /* No managed pages to writeback. */ 2005 if (vma->vm_flags & VM_PFNMAP) 2006 return false; 2007 2008 return vma->vm_file && vma->vm_file->f_mapping && 2009 mapping_can_writeback(vma->vm_file->f_mapping); 2010 } 2011 2012 /* 2013 * Does this VMA require the underlying folios to have their dirty state 2014 * tracked? 2015 */ 2016 bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 2017 { 2018 /* Only shared, writable VMAs require dirty tracking. */ 2019 if (!vma_is_shared_writable(vma)) 2020 return false; 2021 2022 /* Does the filesystem need to be notified? */ 2023 if (vm_ops_needs_writenotify(vma->vm_ops)) 2024 return true; 2025 2026 /* 2027 * Even if the filesystem doesn't indicate a need for writenotify, if it 2028 * can writeback, dirty tracking is still required. 
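 * (For example, a shared writable mapping of a file whose filesystem can
 * write back pages but implements neither ->page_mkwrite nor
 * ->pfn_mkwrite still needs its dirty state tracked.)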
2029 */ 2030 return vma_fs_can_writeback(vma); 2031 } 2032 2033 /* 2034 * Some shared mappings will want the pages marked read-only 2035 * to track write events. If so, we'll downgrade vm_page_prot 2036 * to the private version (using protection_map[] without the 2037 * VM_SHARED bit). 2038 */ 2039 bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 2040 { 2041 /* If it was private or non-writable, the write bit is already clear */ 2042 if (!vma_is_shared_writable(vma)) 2043 return false; 2044 2045 /* The backer wishes to know when pages are first written to? */ 2046 if (vm_ops_needs_writenotify(vma->vm_ops)) 2047 return true; 2048 2049 /* The open routine did something to the protections that pgprot_modify 2050 * won't preserve? */ 2051 if (pgprot_val(vm_page_prot) != 2052 pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 2053 return false; 2054 2055 /* 2056 * Do we need to track softdirty? hugetlb does not support softdirty 2057 * tracking yet. 2058 */ 2059 if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 2060 return true; 2061 2062 /* Do we need write faults for uffd-wp tracking? */ 2063 if (userfaultfd_wp(vma)) 2064 return true; 2065 2066 /* Can the mapping track the dirty pages? */ 2067 return vma_fs_can_writeback(vma); 2068 } 2069 2070 static DEFINE_MUTEX(mm_all_locks_mutex); 2071 2072 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2073 { 2074 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2075 /* 2076 * The LSB of head.next can't change from under us 2077 * because we hold the mm_all_locks_mutex. 2078 */ 2079 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 2080 /* 2081 * We can safely modify head.next after taking the 2082 * anon_vma->root->rwsem. If some other vma in this mm shares 2083 * the same anon_vma we won't take it again. 2084 * 2085 * No need of atomic instructions here, head.next 2086 * can't change from under us thanks to the 2087 * anon_vma->root->rwsem. 2088 */ 2089 if (__test_and_set_bit(0, (unsigned long *) 2090 &anon_vma->root->rb_root.rb_root.rb_node)) 2091 BUG(); 2092 } 2093 } 2094 2095 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 2096 { 2097 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2098 /* 2099 * AS_MM_ALL_LOCKS can't change from under us because 2100 * we hold the mm_all_locks_mutex. 2101 * 2102 * Operations on ->flags have to be atomic because 2103 * even if AS_MM_ALL_LOCKS is stable thanks to the 2104 * mm_all_locks_mutex, there may be other cpus 2105 * changing other bitflags in parallel to us. 2106 */ 2107 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2108 BUG(); 2109 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 2110 } 2111 } 2112 2113 /* 2114 * This operation locks against the VM for all pte/vma/mm related 2115 * operations that could ever happen on a certain mm. This includes 2116 * vmtruncate, try_to_unmap, and all page faults. 2117 * 2118 * The caller must take the mmap_lock in write mode before calling 2119 * mm_take_all_locks(). The caller isn't allowed to release the 2120 * mmap_lock until mm_drop_all_locks() returns. 2121 * 2122 * mmap_lock in write mode is required in order to block all operations 2123 * that could modify pagetables and free pages without need of 2124 * altering the vma layout. It's also needed in write mode to avoid new 2125 * anon_vmas to be associated with existing vmas. 
2126 * 2127 * A single task can't take more than one mm_take_all_locks() in a row 2128 * or it would deadlock. 2129 * 2130 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 2131 * mapping->flags avoid taking the same lock twice, if more than one 2132 * vma in this mm is backed by the same anon_vma or address_space. 2133 * 2134 * We take locks in the following order, according to the comment at the 2135 * beginning of mm/rmap.c: 2136 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for 2137 * hugetlb mapping); 2138 * - all vmas marked locked; 2139 * - all i_mmap_rwsem locks; 2140 * - all anon_vma->rwsems 2141 * 2142 * We can take all locks within these types randomly because the VM code 2143 * doesn't nest them and we are protected from parallel mm_take_all_locks() by 2144 * mm_all_locks_mutex. 2145 * 2146 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations 2147 * that may have to take thousands of locks. 2148 * 2149 * mm_take_all_locks() can fail if it's interrupted by signals. 2150 */ 2151 int mm_take_all_locks(struct mm_struct *mm) 2152 { 2153 struct vm_area_struct *vma; 2154 struct anon_vma_chain *avc; 2155 VMA_ITERATOR(vmi, mm, 0); 2156 2157 mmap_assert_write_locked(mm); 2158 2159 mutex_lock(&mm_all_locks_mutex); 2160 2161 /* 2162 * vma_start_write() does not have a complement in mm_drop_all_locks() 2163 * because vma_start_write() is always asymmetrical; it marks a VMA as 2164 * being written to until mmap_write_unlock() or mmap_write_downgrade() 2165 * is reached. 2166 */ 2167 for_each_vma(vmi, vma) { 2168 if (signal_pending(current)) 2169 goto out_unlock; 2170 vma_start_write(vma); 2171 } 2172 2173 vma_iter_init(&vmi, mm, 0); 2174 for_each_vma(vmi, vma) { 2175 if (signal_pending(current)) 2176 goto out_unlock; 2177 if (vma->vm_file && vma->vm_file->f_mapping && 2178 is_vm_hugetlb_page(vma)) 2179 vm_lock_mapping(mm, vma->vm_file->f_mapping); 2180 } 2181 2182 vma_iter_init(&vmi, mm, 0); 2183 for_each_vma(vmi, vma) { 2184 if (signal_pending(current)) 2185 goto out_unlock; 2186 if (vma->vm_file && vma->vm_file->f_mapping && 2187 !is_vm_hugetlb_page(vma)) 2188 vm_lock_mapping(mm, vma->vm_file->f_mapping); 2189 } 2190 2191 vma_iter_init(&vmi, mm, 0); 2192 for_each_vma(vmi, vma) { 2193 if (signal_pending(current)) 2194 goto out_unlock; 2195 if (vma->anon_vma) 2196 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2197 vm_lock_anon_vma(mm, avc->anon_vma); 2198 } 2199 2200 return 0; 2201 2202 out_unlock: 2203 mm_drop_all_locks(mm); 2204 return -EINTR; 2205 } 2206 2207 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2208 { 2209 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2210 /* 2211 * The LSB of head.next can't change to 0 from under 2212 * us because we hold the mm_all_locks_mutex. 2213 * 2214 * We must however clear the bitflag before unlocking 2215 * the vma so the users using the anon_vma->rb_root will 2216 * never see our bitflag. 2217 * 2218 * No need of atomic instructions here, head.next 2219 * can't change from under us until we release the 2220 * anon_vma->root->rwsem. 2221 */ 2222 if (!__test_and_clear_bit(0, (unsigned long *) 2223 &anon_vma->root->rb_root.rb_root.rb_node)) 2224 BUG(); 2225 anon_vma_unlock_write(anon_vma); 2226 } 2227 } 2228 2229 static void vm_unlock_mapping(struct address_space *mapping) 2230 { 2231 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2232 /* 2233 * AS_MM_ALL_LOCKS can't change to 0 from under us 2234 * because we hold the mm_all_locks_mutex.
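 * As on the locking side, the flag is cleared with an atomic
 * test_and_clear_bit() because other CPUs may update other bits in
 * mapping->flags in parallel.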
2235 */ 2236 i_mmap_unlock_write(mapping); 2237 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2238 &mapping->flags)) 2239 BUG(); 2240 } 2241 } 2242 2243 /* 2244 * The mmap_lock cannot be released by the caller until 2245 * mm_drop_all_locks() returns. 2246 */ 2247 void mm_drop_all_locks(struct mm_struct *mm) 2248 { 2249 struct vm_area_struct *vma; 2250 struct anon_vma_chain *avc; 2251 VMA_ITERATOR(vmi, mm, 0); 2252 2253 mmap_assert_write_locked(mm); 2254 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2255 2256 for_each_vma(vmi, vma) { 2257 if (vma->anon_vma) 2258 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2259 vm_unlock_anon_vma(avc->anon_vma); 2260 if (vma->vm_file && vma->vm_file->f_mapping) 2261 vm_unlock_mapping(vma->vm_file->f_mapping); 2262 } 2263 2264 mutex_unlock(&mm_all_locks_mutex); 2265 } 2266 2267 /* 2268 * We account for memory if it's a private writeable mapping, 2269 * not hugepages and VM_NORESERVE wasn't set. 2270 */ 2271 static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) 2272 { 2273 /* 2274 * hugetlb has its own accounting separate from the core VM 2275 * VM_HUGETLB may not be set yet so we cannot check for that flag. 2276 */ 2277 if (file && is_file_hugepages(file)) 2278 return false; 2279 2280 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 2281 } 2282 2283 /* 2284 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() 2285 * operation. 2286 * @vms: The vma unmap structure 2287 * @mas_detach: The maple state with the detached maple tree 2288 * 2289 * Reattach any detached vmas, free up the maple tree used to track the vmas. 2290 * If that's not possible because the ptes are cleared (and vm_ops->closed() may 2291 * have been called), then a NULL is written over the vmas and the vmas are 2292 * removed (munmap() completed). 2293 */ 2294 static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, 2295 struct ma_state *mas_detach) 2296 { 2297 struct ma_state *mas = &vms->vmi->mas; 2298 2299 if (!vms->nr_pages) 2300 return; 2301 2302 if (vms->clear_ptes) 2303 return reattach_vmas(mas_detach); 2304 2305 /* 2306 * Aborting cannot just call the vm_ops open() because they are often 2307 * not symmetrical and state data has been lost. Resort to the old 2308 * failure method of leaving a gap where the MAP_FIXED mapping failed. 2309 */ 2310 mas_set_range(mas, vms->start, vms->end - 1); 2311 mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); 2312 /* Clean up the insertion of the unfortunate gap */ 2313 vms_complete_munmap_vmas(vms, mas_detach); 2314 } 2315 2316 /* 2317 * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be 2318 * unmapped once the map operation is completed, check limits, account mapping 2319 * and clean up any pre-existing VMAs. 2320 * 2321 * @map: Mapping state. 2322 * @uf: Userfaultfd context list. 2323 * 2324 * Returns: 0 on success, error code otherwise. 2325 */ 2326 static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) 2327 { 2328 int error; 2329 struct vma_iterator *vmi = map->vmi; 2330 struct vma_munmap_struct *vms = &map->vms; 2331 2332 /* Find the first overlapping VMA and initialise unmap state. */ 2333 vms->vma = vma_find(vmi, map->end); 2334 init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, 2335 /* unlock = */ false); 2336 2337 /* OK, we have overlapping VMAs - prepare to unmap them. 
*/ 2338 if (vms->vma) { 2339 mt_init_flags(&map->mt_detach, 2340 vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 2341 mt_on_stack(map->mt_detach); 2342 mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); 2343 /* Prepare to unmap any existing mapping in the area */ 2344 error = vms_gather_munmap_vmas(vms, &map->mas_detach); 2345 if (error) { 2346 /* On error VMAs will already have been reattached. */ 2347 vms->nr_pages = 0; 2348 return error; 2349 } 2350 2351 map->next = vms->next; 2352 map->prev = vms->prev; 2353 } else { 2354 map->next = vma_iter_next_rewind(vmi, &map->prev); 2355 } 2356 2357 /* Check against address space limit. */ 2358 if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) 2359 return -ENOMEM; 2360 2361 /* Private writable mapping: check memory availability. */ 2362 if (accountable_mapping(map->file, map->flags)) { 2363 map->charged = map->pglen; 2364 map->charged -= vms->nr_accounted; 2365 if (map->charged) { 2366 error = security_vm_enough_memory_mm(map->mm, map->charged); 2367 if (error) 2368 return error; 2369 } 2370 2371 vms->nr_accounted = 0; 2372 map->flags |= VM_ACCOUNT; 2373 } 2374 2375 /* 2376 * Clear PTEs while the vma is still in the tree so that rmap 2377 * cannot race with the freeing later in the truncate scenario. 2378 * This is also needed for mmap_file(), which is why vm_ops 2379 * close function is called. 2380 */ 2381 vms_clean_up_area(vms, &map->mas_detach); 2382 2383 return 0; 2384 } 2385 2386 2387 static int __mmap_new_file_vma(struct mmap_state *map, 2388 struct vm_area_struct *vma) 2389 { 2390 struct vma_iterator *vmi = map->vmi; 2391 int error; 2392 2393 vma->vm_file = get_file(map->file); 2394 2395 if (!map->file->f_op->mmap) 2396 return 0; 2397 2398 error = mmap_file(vma->vm_file, vma); 2399 if (error) { 2400 fput(vma->vm_file); 2401 vma->vm_file = NULL; 2402 2403 vma_iter_set(vmi, vma->vm_end); 2404 /* Undo any partial mapping done by a device driver. */ 2405 unmap_region(&vmi->mas, vma, map->prev, map->next); 2406 2407 return error; 2408 } 2409 2410 /* Drivers cannot alter the address of the VMA. */ 2411 WARN_ON_ONCE(map->addr != vma->vm_start); 2412 /* 2413 * Drivers should not permit writability when previously it was 2414 * disallowed. 2415 */ 2416 VM_WARN_ON_ONCE(map->flags != vma->vm_flags && 2417 !(map->flags & VM_MAYWRITE) && 2418 (vma->vm_flags & VM_MAYWRITE)); 2419 2420 map->flags = vma->vm_flags; 2421 2422 return 0; 2423 } 2424 2425 /* 2426 * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not 2427 * possible. 2428 * 2429 * @map: Mapping state. 2430 * @vmap: Output pointer for the new VMA. 2431 * 2432 * Returns: Zero on success, or an error. 2433 */ 2434 static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) 2435 { 2436 struct vma_iterator *vmi = map->vmi; 2437 int error = 0; 2438 struct vm_area_struct *vma; 2439 2440 /* 2441 * Determine the object being mapped and call the appropriate 2442 * specific mapper. the address has already been validated, but 2443 * not unmapped, but the maps are removed from the list. 
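 * (In this path any VMAs overlapping the range have already been gathered
 * for unmapping by __mmap_prepare().)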
2444 */ 2445 vma = vm_area_alloc(map->mm); 2446 if (!vma) 2447 return -ENOMEM; 2448 2449 vma_iter_config(vmi, map->addr, map->end); 2450 vma_set_range(vma, map->addr, map->end, map->pgoff); 2451 vm_flags_init(vma, map->flags); 2452 vma->vm_page_prot = map->page_prot; 2453 2454 if (vma_iter_prealloc(vmi, vma)) { 2455 error = -ENOMEM; 2456 goto free_vma; 2457 } 2458 2459 if (map->file) 2460 error = __mmap_new_file_vma(map, vma); 2461 else if (map->flags & VM_SHARED) 2462 error = shmem_zero_setup(vma); 2463 else 2464 vma_set_anonymous(vma); 2465 2466 if (error) 2467 goto free_iter_vma; 2468 2469 #ifdef CONFIG_SPARC64 2470 /* TODO: Fix SPARC ADI! */ 2471 WARN_ON_ONCE(!arch_validate_flags(map->flags)); 2472 #endif 2473 2474 /* Lock the VMA since it is modified after insertion into VMA tree */ 2475 vma_start_write(vma); 2476 vma_iter_store_new(vmi, vma); 2477 map->mm->map_count++; 2478 vma_link_file(vma); 2479 2480 /* 2481 * vma_merge_new_range() calls khugepaged_enter_vma() too, the below 2482 * call covers the non-merge case. 2483 */ 2484 if (!vma_is_anonymous(vma)) 2485 khugepaged_enter_vma(vma, map->flags); 2486 ksm_add_vma(vma); 2487 *vmap = vma; 2488 return 0; 2489 2490 free_iter_vma: 2491 vma_iter_free(vmi); 2492 free_vma: 2493 vm_area_free(vma); 2494 return error; 2495 } 2496 2497 /* 2498 * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping 2499 * statistics, handle locking and finalise the VMA. 2500 * 2501 * @map: Mapping state. 2502 * @vma: Merged or newly allocated VMA for the mmap()'d region. 2503 */ 2504 static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) 2505 { 2506 struct mm_struct *mm = map->mm; 2507 unsigned long vm_flags = vma->vm_flags; 2508 2509 perf_event_mmap(vma); 2510 2511 /* Unmap any existing mapping in the area. */ 2512 vms_complete_munmap_vmas(&map->vms, &map->mas_detach); 2513 2514 vm_stat_account(mm, vma->vm_flags, map->pglen); 2515 if (vm_flags & VM_LOCKED) { 2516 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || 2517 is_vm_hugetlb_page(vma) || 2518 vma == get_gate_vma(mm)) 2519 vm_flags_clear(vma, VM_LOCKED_MASK); 2520 else 2521 mm->locked_vm += map->pglen; 2522 } 2523 2524 if (vma->vm_file) 2525 uprobe_mmap(vma); 2526 2527 /* 2528 * New (or expanded) vma always get soft dirty status. 2529 * Otherwise user-space soft-dirty page tracker won't 2530 * be able to distinguish situation when vma area unmapped, 2531 * then new mapped in-place (which must be aimed as 2532 * a completely new data area). 2533 */ 2534 vm_flags_set(vma, VM_SOFTDIRTY); 2535 2536 vma_set_page_prot(vma); 2537 } 2538 2539 /* 2540 * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that 2541 * specifies it. 2542 * 2543 * This is called prior to any merge attempt, and updates whitelisted fields 2544 * that are permitted to be updated by the caller. 2545 * 2546 * All but user-defined fields will be pre-populated with original values. 2547 * 2548 * Returns 0 on success, or an error code otherwise. 2549 */ 2550 static int call_mmap_prepare(struct mmap_state *map) 2551 { 2552 int err; 2553 struct vm_area_desc desc = { 2554 .mm = map->mm, 2555 .start = map->addr, 2556 .end = map->end, 2557 2558 .pgoff = map->pgoff, 2559 .file = map->file, 2560 .vm_flags = map->flags, 2561 .page_prot = map->page_prot, 2562 }; 2563 2564 /* Invoke the hook. */ 2565 err = __call_mmap_prepare(map->file, &desc); 2566 if (err) 2567 return err; 2568 2569 /* Update fields permitted to be changed. 
*/ 2570 map->pgoff = desc.pgoff; 2571 map->file = desc.file; 2572 map->flags = desc.vm_flags; 2573 map->page_prot = desc.page_prot; 2574 /* User-defined fields. */ 2575 map->vm_ops = desc.vm_ops; 2576 map->vm_private_data = desc.private_data; 2577 2578 return 0; 2579 } 2580 2581 static void set_vma_user_defined_fields(struct vm_area_struct *vma, 2582 struct mmap_state *map) 2583 { 2584 if (map->vm_ops) 2585 vma->vm_ops = map->vm_ops; 2586 vma->vm_private_data = map->vm_private_data; 2587 } 2588 2589 static unsigned long __mmap_region(struct file *file, unsigned long addr, 2590 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2591 struct list_head *uf) 2592 { 2593 struct mm_struct *mm = current->mm; 2594 struct vm_area_struct *vma = NULL; 2595 int error; 2596 bool have_mmap_prepare = file && file->f_op->mmap_prepare; 2597 VMA_ITERATOR(vmi, mm, addr); 2598 MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); 2599 2600 error = __mmap_prepare(&map, uf); 2601 if (!error && have_mmap_prepare) 2602 error = call_mmap_prepare(&map); 2603 if (error) 2604 goto abort_munmap; 2605 2606 /* Attempt to merge with adjacent VMAs... */ 2607 if (map.prev || map.next) { 2608 VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); 2609 2610 vma = vma_merge_new_range(&vmg); 2611 } 2612 2613 /* ...but if we can't, allocate a new VMA. */ 2614 if (!vma) { 2615 error = __mmap_new_vma(&map, &vma); 2616 if (error) 2617 goto unacct_error; 2618 } 2619 2620 if (have_mmap_prepare) 2621 set_vma_user_defined_fields(vma, &map); 2622 2623 __mmap_complete(&map, vma); 2624 2625 return addr; 2626 2627 /* Accounting was done by __mmap_prepare(). */ 2628 unacct_error: 2629 if (map.charged) 2630 vm_unacct_memory(map.charged); 2631 abort_munmap: 2632 vms_abort_munmap_vmas(&map.vms, &map.mas_detach); 2633 return error; 2634 } 2635 2636 /** 2637 * mmap_region() - Actually perform the userland mapping of a VMA into 2638 * current->mm with known, aligned and overflow-checked @addr and @len, and 2639 * correctly determined VMA flags @vm_flags and page offset @pgoff. 2640 * 2641 * This is an internal memory management function, and should not be used 2642 * directly. 2643 * 2644 * The caller must write-lock current->mm->mmap_lock. 2645 * 2646 * @file: If a file-backed mapping, a pointer to the struct file describing the 2647 * file to be mapped, otherwise NULL. 2648 * @addr: The page-aligned address at which to perform the mapping. 2649 * @len: The page-aligned, non-zero, length of the mapping. 2650 * @vm_flags: The VMA flags which should be applied to the mapping. 2651 * @pgoff: If @file is specified, the page offset into the file, if not then 2652 * the virtual page offset in memory of the anonymous mapping. 2653 * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap 2654 * events. 2655 * 2656 * Returns: Either an error, or the address at which the requested mapping has 2657 * been performed. 2658 */ 2659 unsigned long mmap_region(struct file *file, unsigned long addr, 2660 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2661 struct list_head *uf) 2662 { 2663 unsigned long ret; 2664 bool writable_file_mapping = false; 2665 2666 mmap_assert_write_locked(current->mm); 2667 2668 /* Check to see if MDWE is applicable. */ 2669 if (map_deny_write_exec(vm_flags, vm_flags)) 2670 return -EACCES; 2671 2672 /* Allow architectures to sanity-check the vm_flags. */ 2673 if (!arch_validate_flags(vm_flags)) 2674 return -EINVAL; 2675 2676 /* Map writable and ensure this isn't a sealed memfd. 
*/ 2677 if (file && is_shared_maywrite(vm_flags)) { 2678 int error = mapping_map_writable(file->f_mapping); 2679 2680 if (error) 2681 return error; 2682 writable_file_mapping = true; 2683 } 2684 2685 ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); 2686 2687 /* Clear our write mapping regardless of error. */ 2688 if (writable_file_mapping) 2689 mapping_unmap_writable(file->f_mapping); 2690 2691 validate_mm(current->mm); 2692 return ret; 2693 } 2694 2695 /* 2696 * do_brk_flags() - Increase the brk vma if the flags match. 2697 * @vmi: The vma iterator 2698 * @addr: The start address 2699 * @len: The length of the increase 2700 * @vma: The vma, 2701 * @flags: The VMA Flags 2702 * 2703 * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags 2704 * do not match then create a new anonymous VMA. Eventually we may be able to 2705 * do some brk-specific accounting here. 2706 */ 2707 int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, 2708 unsigned long addr, unsigned long len, unsigned long flags) 2709 { 2710 struct mm_struct *mm = current->mm; 2711 2712 /* 2713 * Check against address space limits by the changed size 2714 * Note: This happens *after* clearing old mappings in some code paths. 2715 */ 2716 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2717 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) 2718 return -ENOMEM; 2719 2720 if (mm->map_count > sysctl_max_map_count) 2721 return -ENOMEM; 2722 2723 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2724 return -ENOMEM; 2725 2726 /* 2727 * Expand the existing vma if possible; Note that singular lists do not 2728 * occur after forking, so the expand will only happen on new VMAs. 2729 */ 2730 if (vma && vma->vm_end == addr) { 2731 VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); 2732 2733 vmg.prev = vma; 2734 /* vmi is positioned at prev, which this mode expects. */ 2735 vmg.just_expand = true; 2736 2737 if (vma_merge_new_range(&vmg)) 2738 goto out; 2739 else if (vmg_nomem(&vmg)) 2740 goto unacct_fail; 2741 } 2742 2743 if (vma) 2744 vma_iter_next_range(vmi); 2745 /* create a vma struct for an anonymous mapping */ 2746 vma = vm_area_alloc(mm); 2747 if (!vma) 2748 goto unacct_fail; 2749 2750 vma_set_anonymous(vma); 2751 vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); 2752 vm_flags_init(vma, flags); 2753 vma->vm_page_prot = vm_get_page_prot(flags); 2754 vma_start_write(vma); 2755 if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) 2756 goto mas_store_fail; 2757 2758 mm->map_count++; 2759 validate_mm(mm); 2760 ksm_add_vma(vma); 2761 out: 2762 perf_event_mmap(vma); 2763 mm->total_vm += len >> PAGE_SHIFT; 2764 mm->data_vm += len >> PAGE_SHIFT; 2765 if (flags & VM_LOCKED) 2766 mm->locked_vm += (len >> PAGE_SHIFT); 2767 vm_flags_set(vma, VM_SOFTDIRTY); 2768 return 0; 2769 2770 mas_store_fail: 2771 vm_area_free(vma); 2772 unacct_fail: 2773 vm_unacct_memory(len >> PAGE_SHIFT); 2774 return -ENOMEM; 2775 } 2776 2777 /** 2778 * unmapped_area() - Find an area between the low_limit and the high_limit with 2779 * the correct alignment and offset, all from @info. Note: current->mm is used 2780 * for the search. 2781 * 2782 * @info: The unmapped area information including the range [low_limit - 2783 * high_limit), the alignment offset and mask. 2784 * 2785 * Return: A memory address or -ENOMEM. 
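 *
 * Callers normally reach this via vm_unmapped_area() with a filled-in
 * struct vm_unmapped_area_info. A simplified bottom-up sketch (the values
 * are purely illustrative):
 *
 *      struct vm_unmapped_area_info info = {};
 *
 *      info.length = len;
 *      info.low_limit = current->mm->mmap_base;
 *      info.high_limit = TASK_SIZE;
 *      return vm_unmapped_area(&info);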
2786 */ 2787 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 2788 { 2789 unsigned long length, gap; 2790 unsigned long low_limit, high_limit; 2791 struct vm_area_struct *tmp; 2792 VMA_ITERATOR(vmi, current->mm, 0); 2793 2794 /* Adjust search length to account for worst case alignment overhead */ 2795 length = info->length + info->align_mask + info->start_gap; 2796 if (length < info->length) 2797 return -ENOMEM; 2798 2799 low_limit = info->low_limit; 2800 if (low_limit < mmap_min_addr) 2801 low_limit = mmap_min_addr; 2802 high_limit = info->high_limit; 2803 retry: 2804 if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) 2805 return -ENOMEM; 2806 2807 /* 2808 * Adjust for the gap first so it doesn't interfere with the 2809 * later alignment. The first step is the minimum needed to 2810 * fulfill the start gap, the next step is the minimum to align 2811 * that. It is the minimum needed to fulfill both. 2812 */ 2813 gap = vma_iter_addr(&vmi) + info->start_gap; 2814 gap += (info->align_offset - gap) & info->align_mask; 2815 tmp = vma_next(&vmi); 2816 if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ 2817 if (vm_start_gap(tmp) < gap + length - 1) { 2818 low_limit = tmp->vm_end; 2819 vma_iter_reset(&vmi); 2820 goto retry; 2821 } 2822 } else { 2823 tmp = vma_prev(&vmi); 2824 if (tmp && vm_end_gap(tmp) > gap) { 2825 low_limit = vm_end_gap(tmp); 2826 vma_iter_reset(&vmi); 2827 goto retry; 2828 } 2829 } 2830 2831 return gap; 2832 } 2833 2834 /** 2835 * unmapped_area_topdown() - Find an area between the low_limit and the 2836 * high_limit with the correct alignment and offset at the highest available 2837 * address, all from @info. Note: current->mm is used for the search. 2838 * 2839 * @info: The unmapped area information including the range [low_limit - 2840 * high_limit), the alignment offset and mask. 2841 * 2842 * Return: A memory address or -ENOMEM. 2843 */ 2844 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 2845 { 2846 unsigned long length, gap, gap_end; 2847 unsigned long low_limit, high_limit; 2848 struct vm_area_struct *tmp; 2849 VMA_ITERATOR(vmi, current->mm, 0); 2850 2851 /* Adjust search length to account for worst case alignment overhead */ 2852 length = info->length + info->align_mask + info->start_gap; 2853 if (length < info->length) 2854 return -ENOMEM; 2855 2856 low_limit = info->low_limit; 2857 if (low_limit < mmap_min_addr) 2858 low_limit = mmap_min_addr; 2859 high_limit = info->high_limit; 2860 retry: 2861 if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) 2862 return -ENOMEM; 2863 2864 gap = vma_iter_end(&vmi) - info->length; 2865 gap -= (gap - info->align_offset) & info->align_mask; 2866 gap_end = vma_iter_end(&vmi); 2867 tmp = vma_next(&vmi); 2868 if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ 2869 if (vm_start_gap(tmp) < gap_end) { 2870 high_limit = vm_start_gap(tmp); 2871 vma_iter_reset(&vmi); 2872 goto retry; 2873 } 2874 } else { 2875 tmp = vma_prev(&vmi); 2876 if (tmp && vm_end_gap(tmp) > gap) { 2877 high_limit = tmp->vm_start; 2878 vma_iter_reset(&vmi); 2879 goto retry; 2880 } 2881 } 2882 2883 return gap; 2884 } 2885 2886 /* 2887 * Verify that the stack growth is acceptable and 2888 * update accounting. This is shared with both the 2889 * grow-up and grow-down cases.
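 * Here 'size' is the would-be total size of the stack VMA in bytes and
 * 'grow' is the number of pages being added to it.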
2890 */ 2891 static int acct_stack_growth(struct vm_area_struct *vma, 2892 unsigned long size, unsigned long grow) 2893 { 2894 struct mm_struct *mm = vma->vm_mm; 2895 unsigned long new_start; 2896 2897 /* address space limit tests */ 2898 if (!may_expand_vm(mm, vma->vm_flags, grow)) 2899 return -ENOMEM; 2900 2901 /* Stack limit test */ 2902 if (size > rlimit(RLIMIT_STACK)) 2903 return -ENOMEM; 2904 2905 /* mlock limit tests */ 2906 if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) 2907 return -ENOMEM; 2908 2909 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2910 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2911 vma->vm_end - size; 2912 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2913 return -EFAULT; 2914 2915 /* 2916 * Overcommit.. This must be the final test, as it will 2917 * update security statistics. 2918 */ 2919 if (security_vm_enough_memory_mm(mm, grow)) 2920 return -ENOMEM; 2921 2922 return 0; 2923 } 2924 2925 #if defined(CONFIG_STACK_GROWSUP) 2926 /* 2927 * PA-RISC uses this for its stack. 2928 * vma is the last one with address > vma->vm_end. Have to extend vma. 2929 */ 2930 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2931 { 2932 struct mm_struct *mm = vma->vm_mm; 2933 struct vm_area_struct *next; 2934 unsigned long gap_addr; 2935 int error = 0; 2936 VMA_ITERATOR(vmi, mm, vma->vm_start); 2937 2938 if (!(vma->vm_flags & VM_GROWSUP)) 2939 return -EFAULT; 2940 2941 mmap_assert_write_locked(mm); 2942 2943 /* Guard against exceeding limits of the address space. */ 2944 address &= PAGE_MASK; 2945 if (address >= (TASK_SIZE & PAGE_MASK)) 2946 return -ENOMEM; 2947 address += PAGE_SIZE; 2948 2949 /* Enforce stack_guard_gap */ 2950 gap_addr = address + stack_guard_gap; 2951 2952 /* Guard against overflow */ 2953 if (gap_addr < address || gap_addr > TASK_SIZE) 2954 gap_addr = TASK_SIZE; 2955 2956 next = find_vma_intersection(mm, vma->vm_end, gap_addr); 2957 if (next && vma_is_accessible(next)) { 2958 if (!(next->vm_flags & VM_GROWSUP)) 2959 return -ENOMEM; 2960 /* Check that both stack segments have the same anon_vma? */ 2961 } 2962 2963 if (next) 2964 vma_iter_prev_range_limit(&vmi, address); 2965 2966 vma_iter_config(&vmi, vma->vm_start, address); 2967 if (vma_iter_prealloc(&vmi, vma)) 2968 return -ENOMEM; 2969 2970 /* We must make sure the anon_vma is allocated. */ 2971 if (unlikely(anon_vma_prepare(vma))) { 2972 vma_iter_free(&vmi); 2973 return -ENOMEM; 2974 } 2975 2976 /* Lock the VMA before expanding to prevent concurrent page faults */ 2977 vma_start_write(vma); 2978 /* We update the anon VMA tree. */ 2979 anon_vma_lock_write(vma->anon_vma); 2980 2981 /* Somebody else might have raced and expanded it already */ 2982 if (address > vma->vm_end) { 2983 unsigned long size, grow; 2984 2985 size = address - vma->vm_start; 2986 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2987 2988 error = -ENOMEM; 2989 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2990 error = acct_stack_growth(vma, size, grow); 2991 if (!error) { 2992 if (vma->vm_flags & VM_LOCKED) 2993 mm->locked_vm += grow; 2994 vm_stat_account(mm, vma->vm_flags, grow); 2995 anon_vma_interval_tree_pre_update_vma(vma); 2996 vma->vm_end = address; 2997 /* Overwrite old entry in mtree. 
*/ 2998 vma_iter_store_overwrite(&vmi, vma); 2999 anon_vma_interval_tree_post_update_vma(vma); 3000 3001 perf_event_mmap(vma); 3002 } 3003 } 3004 } 3005 anon_vma_unlock_write(vma->anon_vma); 3006 vma_iter_free(&vmi); 3007 validate_mm(mm); 3008 return error; 3009 } 3010 #endif /* CONFIG_STACK_GROWSUP */ 3011 3012 /* 3013 * vma is the first one with address < vma->vm_start. Have to extend vma. 3014 * mmap_lock held for writing. 3015 */ 3016 int expand_downwards(struct vm_area_struct *vma, unsigned long address) 3017 { 3018 struct mm_struct *mm = vma->vm_mm; 3019 struct vm_area_struct *prev; 3020 int error = 0; 3021 VMA_ITERATOR(vmi, mm, vma->vm_start); 3022 3023 if (!(vma->vm_flags & VM_GROWSDOWN)) 3024 return -EFAULT; 3025 3026 mmap_assert_write_locked(mm); 3027 3028 address &= PAGE_MASK; 3029 if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) 3030 return -EPERM; 3031 3032 /* Enforce stack_guard_gap */ 3033 prev = vma_prev(&vmi); 3034 /* Check that both stack segments have the same anon_vma? */ 3035 if (prev) { 3036 if (!(prev->vm_flags & VM_GROWSDOWN) && 3037 vma_is_accessible(prev) && 3038 (address - prev->vm_end < stack_guard_gap)) 3039 return -ENOMEM; 3040 } 3041 3042 if (prev) 3043 vma_iter_next_range_limit(&vmi, vma->vm_start); 3044 3045 vma_iter_config(&vmi, address, vma->vm_end); 3046 if (vma_iter_prealloc(&vmi, vma)) 3047 return -ENOMEM; 3048 3049 /* We must make sure the anon_vma is allocated. */ 3050 if (unlikely(anon_vma_prepare(vma))) { 3051 vma_iter_free(&vmi); 3052 return -ENOMEM; 3053 } 3054 3055 /* Lock the VMA before expanding to prevent concurrent page faults */ 3056 vma_start_write(vma); 3057 /* We update the anon VMA tree. */ 3058 anon_vma_lock_write(vma->anon_vma); 3059 3060 /* Somebody else might have raced and expanded it already */ 3061 if (address < vma->vm_start) { 3062 unsigned long size, grow; 3063 3064 size = vma->vm_end - address; 3065 grow = (vma->vm_start - address) >> PAGE_SHIFT; 3066 3067 error = -ENOMEM; 3068 if (grow <= vma->vm_pgoff) { 3069 error = acct_stack_growth(vma, size, grow); 3070 if (!error) { 3071 if (vma->vm_flags & VM_LOCKED) 3072 mm->locked_vm += grow; 3073 vm_stat_account(mm, vma->vm_flags, grow); 3074 anon_vma_interval_tree_pre_update_vma(vma); 3075 vma->vm_start = address; 3076 vma->vm_pgoff -= grow; 3077 /* Overwrite old entry in mtree. */ 3078 vma_iter_store_overwrite(&vmi, vma); 3079 anon_vma_interval_tree_post_update_vma(vma); 3080 3081 perf_event_mmap(vma); 3082 } 3083 } 3084 } 3085 anon_vma_unlock_write(vma->anon_vma); 3086 vma_iter_free(&vmi); 3087 validate_mm(mm); 3088 return error; 3089 } 3090 3091 int __vm_munmap(unsigned long start, size_t len, bool unlock) 3092 { 3093 int ret; 3094 struct mm_struct *mm = current->mm; 3095 LIST_HEAD(uf); 3096 VMA_ITERATOR(vmi, mm, start); 3097 3098 if (mmap_write_lock_killable(mm)) 3099 return -EINTR; 3100 3101 ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); 3102 if (ret || !unlock) 3103 mmap_write_unlock(mm); 3104 3105 userfaultfd_unmap_complete(mm, &uf); 3106 return ret; 3107 } 3108 3109 3110 /* Insert vm structure into process list sorted by address 3111 * and into the inode's i_mmap tree. If vm_file is non-NULL 3112 * then i_mmap_rwsem is taken here. 
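 * If the vma is marked VM_ACCOUNT, its pages are charged against the
 * process's commit limit first, and the charge is dropped again if
 * linking the vma fails.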
3113 */ 3114 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 3115 { 3116 unsigned long charged = vma_pages(vma); 3117 3118 3119 if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) 3120 return -ENOMEM; 3121 3122 if ((vma->vm_flags & VM_ACCOUNT) && 3123 security_vm_enough_memory_mm(mm, charged)) 3124 return -ENOMEM; 3125 3126 /* 3127 * The vm_pgoff of a purely anonymous vma should be irrelevant 3128 * until its first write fault, when page's anon_vma and index 3129 * are set. But now set the vm_pgoff it will almost certainly 3130 * end up with (unless mremap moves it elsewhere before that 3131 * first wfault), so /proc/pid/maps tells a consistent story. 3132 * 3133 * By setting it to reflect the virtual start address of the 3134 * vma, merges and splits can happen in a seamless way, just 3135 * using the existing file pgoff checks and manipulations. 3136 * Similarly in do_mmap and in do_brk_flags. 3137 */ 3138 if (vma_is_anonymous(vma)) { 3139 BUG_ON(vma->anon_vma); 3140 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 3141 } 3142 3143 if (vma_link(mm, vma)) { 3144 if (vma->vm_flags & VM_ACCOUNT) 3145 vm_unacct_memory(charged); 3146 return -ENOMEM; 3147 } 3148 3149 return 0; 3150 } 3151