// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * VMA-specific functions.
 */

#include "vma_internal.h"
#include "vma.h"

/*
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those if the caller indicates
 * the current vma may be removed as part of the merge.
 */
static inline bool is_mergeable_vma(struct vm_area_struct *vma,
		struct file *file, unsigned long vm_flags,
		struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
		struct anon_vma_name *anon_name, bool may_remove_vma)
{
	/*
	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
	 * match the flags but dirty bit -- the caller should mark
	 * merged VMA as dirty. If dirty bit won't be excluded from
	 * comparison, we increase pressure on the memory system forcing
	 * the kernel to generate new VMAs when old one could be
	 * extended instead.
	 */
	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
		return false;
	if (vma->vm_file != file)
		return false;
	if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
		return false;
	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
		return false;
	if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
		return false;
	return true;
}

static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
		struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
	/*
	 * The list_is_singular() test is to avoid merging VMAs cloned from
	 * parents; this improves scalability by reducing anon_vma lock
	 * contention.
	 */
	if ((!anon_vma1 || !anon_vma2) && (!vma ||
	     list_is_singular(&vma->anon_vma_chain)))
		return true;
	return anon_vma1 == anon_vma2;
}

/*
 * init_multi_vma_prep() - Initializer for struct vma_prepare
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 * @next: The next vma if it is to be adjusted
 * @remove: The first vma to be removed
 * @remove2: The second vma to be removed
 */
static void init_multi_vma_prep(struct vma_prepare *vp,
				struct vm_area_struct *vma,
				struct vm_area_struct *next,
				struct vm_area_struct *remove,
				struct vm_area_struct *remove2)
{
	memset(vp, 0, sizeof(struct vma_prepare));
	vp->vma = vma;
	vp->anon_vma = vma->anon_vma;
	vp->remove = remove;
	vp->remove2 = remove2;
	vp->adj_next = next;
	if (!vp->anon_vma && next)
		vp->anon_vma = next->anon_vma;

	vp->file = vma->vm_file;
	if (vp->file)
		vp->mapping = vma->vm_file->f_mapping;
}

/*
 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
 * @vms: The vma munmap struct
 * @vmi: The vma iterator
 * @vma: The first vm_area_struct to munmap
 * @start: The aligned start address to munmap
 * @end: The aligned end address to munmap
 * @uf: The userfaultfd list_head
 * @unlock: Unlock after the operation. Only unlocked on success
 */
static inline void init_vma_munmap(struct vma_munmap_struct *vms,
		struct vma_iterator *vmi, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, struct list_head *uf,
		bool unlock)
{
	vms->vmi = vmi;
	vms->vma = vma;
	vms->mm = vma->vm_mm;
	vms->start = start;
	vms->end = end;
	vms->unlock = unlock;
	vms->uf = uf;
	vms->vma_count = 0;
	vms->nr_pages = vms->locked_vm = 0;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 *
 * We assume the vma may be removed as part of the merge.
 */
bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
		struct anon_vma *anon_vma, struct file *file,
		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
		struct anon_vma_name *anon_name)
{
	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
		if (vma->vm_pgoff == vm_pgoff)
			return true;
	}
	return false;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We assume that vma is not removed as part of the merge.
 */
bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
		struct anon_vma *anon_vma, struct file *file,
		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
		struct anon_vma_name *anon_name)
{
	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
		pgoff_t vm_pglen;

		vm_pglen = vma_pages(vma);
		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
			return true;
	}
	return false;
}

/*
 * Close a vm structure and free it.
 */
void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
	mpol_put(vma_policy(vma));
	if (unreachable)
		__vm_area_free(vma);
	else
		vm_area_free(vma);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
void unmap_region(struct mm_struct *mm, struct ma_state *mas,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		struct vm_area_struct *next, unsigned long start,
		unsigned long end, unsigned long tree_end, bool mm_wr_locked)
{
	struct mmu_gather tlb;
	unsigned long mt_start = mas->index;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
	mas_set(mas, mt_start);
	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
		      next ? next->vm_start : USER_PGTABLES_CEILING,
		      mm_wr_locked);
	tlb_finish_mmu(&tlb);
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
 * has already been checked or doesn't make sense to fail.
 * VMA Iterator will point to the original VMA.
 */
static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
		       unsigned long addr, int new_below)
{
	struct vma_prepare vp;
	struct vm_area_struct *new;
	int err;

	WARN_ON(vma->vm_start >= addr);
	WARN_ON(vma->vm_end <= addr);

	if (vma->vm_ops && vma->vm_ops->may_split) {
		err = vma->vm_ops->may_split(vma, addr);
		if (err)
			return err;
	}

	new = vm_area_dup(vma);
	if (!new)
		return -ENOMEM;

	if (new_below) {
		new->vm_end = addr;
	} else {
		new->vm_start = addr;
		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
	}

	err = -ENOMEM;
	vma_iter_config(vmi, new->vm_start, new->vm_end);
	if (vma_iter_prealloc(vmi, new))
		goto out_free_vma;

	err = vma_dup_policy(vma, new);
	if (err)
		goto out_free_vmi;

	err = anon_vma_clone(new, vma);
	if (err)
		goto out_free_mpol;

	if (new->vm_file)
		get_file(new->vm_file);

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	vma_start_write(vma);
	vma_start_write(new);

	init_vma_prep(&vp, vma);
	vp.insert = new;
	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);

	if (new_below) {
		vma->vm_start = addr;
		vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
	} else {
		vma->vm_end = addr;
	}

	/* vma_complete stores the new vma */
	vma_complete(&vp, vmi, vma->vm_mm);

	/* Success. */
	if (new_below)
		vma_next(vmi);
	else
		vma_prev(vmi);

	return 0;

out_free_mpol:
	mpol_put(vma_policy(new));
out_free_vmi:
	vma_iter_free(vmi);
out_free_vma:
	vm_area_free(new);
	return err;
}

/*
 * Split a vma into two pieces at address 'addr'; a new vma is allocated
 * either for the first part or the tail.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
		     unsigned long addr, int new_below)
{
	if (vma->vm_mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	return __split_vma(vmi, vma, addr, new_below);
}

/*
 * Ok - we have the memory areas we should free on a maple tree so release them,
 * and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
	unsigned long nr_accounted = 0;
	struct vm_area_struct *vma;

	/* Update high watermark before we lower total_vm */
	update_hiwater_vm(mm);
	mas_for_each(mas, vma, ULONG_MAX) {
		long nrpages = vma_pages(vma);

		if (vma->vm_flags & VM_ACCOUNT)
			nr_accounted += nrpages;
		vm_stat_account(mm, vma->vm_flags, -nrpages);
		remove_vma(vma, false);
	}
	vm_unacct_memory(nr_accounted);
}

/*
 * init_vma_prep() - Initializer wrapper for vma_prepare struct
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 */
void init_vma_prep(struct vma_prepare *vp,
		   struct vm_area_struct *vma)
{
	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
}

/*
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
				      struct address_space *mapping)
{
	if (vma_is_shared_maywrite(vma))
		mapping_unmap_writable(mapping);

	flush_dcache_mmap_lock(mapping);
	vma_interval_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

static void __vma_link_file(struct vm_area_struct *vma,
			    struct address_space *mapping)
{
	if (vma_is_shared_maywrite(vma))
		mapping_allow_writable(mapping);

	flush_dcache_mmap_lock(mapping);
	vma_interval_tree_insert(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * vma_prepare() - Helper function for handling locking VMAs prior to altering
 * @vp: The initialized vma_prepare struct
 */
void vma_prepare(struct vma_prepare *vp)
{
	if (vp->file) {
		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

		if (vp->adj_next)
			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
				      vp->adj_next->vm_end);

		i_mmap_lock_write(vp->mapping);
		if (vp->insert && vp->insert->vm_file) {
			/*
			 * Put into interval tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(vp->insert,
					vp->insert->vm_file->f_mapping);
		}
	}

	if (vp->anon_vma) {
		anon_vma_lock_write(vp->anon_vma);
		anon_vma_interval_tree_pre_update_vma(vp->vma);
		if (vp->adj_next)
			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
	}

	if (vp->file) {
		flush_dcache_mmap_lock(vp->mapping);
		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
		if (vp->adj_next)
			vma_interval_tree_remove(vp->adj_next,
						 &vp->mapping->i_mmap);
	}
}

/*
 * dup_anon_vma() - Helper function to duplicate anon_vma
 * @dst: The destination VMA
 * @src: The source VMA
 * @dup: Pointer to the destination VMA when successful.
 *
 * Returns: 0 on success.
 */
static int dup_anon_vma(struct vm_area_struct *dst,
			struct vm_area_struct *src, struct vm_area_struct **dup)
{
	/*
	 * Easily overlooked: when mprotect shifts the boundary, make sure the
	 * expanding vma has anon_vma set if the shrinking vma had, to cover any
	 * anon pages imported.
	 */
	if (src->anon_vma && !dst->anon_vma) {
		int ret;

		vma_assert_write_locked(dst);
		dst->anon_vma = src->anon_vma;
		ret = anon_vma_clone(dst, src);
		if (ret)
			return ret;

		*dup = dst;
	}

	return 0;
}

#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mt_validate(&mm->mm_mt);
	for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
		struct anon_vma *anon_vma = vma->anon_vma;
		struct anon_vma_chain *avc;
#endif
		unsigned long vmi_start, vmi_end;
		bool warn = 0;

		vmi_start = vma_iter_addr(&vmi);
		vmi_end = vma_iter_end(&vmi);
		if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
			warn = 1;

		if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
			warn = 1;

		if (warn) {
			pr_emerg("issue in %s\n", current->comm);
			dump_stack();
			dump_vma(vma);
			pr_emerg("tree range: %px start %lx end %lx\n", vma,
				 vmi_start, vmi_end - 1);
			vma_iter_dump_tree(&vmi);
		}

#ifdef CONFIG_DEBUG_VM_RB
		if (anon_vma) {
			anon_vma_lock_read(anon_vma);
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				anon_vma_interval_tree_verify(avc);
			anon_vma_unlock_read(anon_vma);
		}
#endif
		i++;
	}
	if (i != mm->map_count) {
		pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
		bug = 1;
	}
	VM_BUG_ON_MM(bug, mm);
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmi: The vma iterator
 * @vma: The vma to expand
 * @start: The start of the vma
 * @end: The exclusive end of the vma
 * @pgoff: The page offset of vma
 * @next: The next vma, if any
 *
 * Expand @vma to @start and @end. Can expand off the start and end. Will
 * expand over @next if it's different from @vma and @end == @next->vm_end.
 * Checking if the @vma can expand and merge with @next needs to be handled by
 * the caller.
 *
 * Returns: 0 on success
 */
int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff,
	       struct vm_area_struct *next)
{
	struct vm_area_struct *anon_dup = NULL;
	bool remove_next = false;
	struct vma_prepare vp;

	vma_start_write(vma);
	if (next && (vma != next) && (end == next->vm_end)) {
		int ret;

		remove_next = true;
		vma_start_write(next);
		ret = dup_anon_vma(vma, next, &anon_dup);
		if (ret)
			return ret;
	}

	init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
	/* Not merging but overwriting any part of next is not handled. */
	VM_WARN_ON(next && !vp.remove &&
		   next != vma && end > next->vm_start);
	/* Only handles expanding */
	VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);

	/* Note: vma iterator must be pointing to 'start' */
	vma_iter_config(vmi, start, end);
	if (vma_iter_prealloc(vmi, vma))
		goto nomem;

	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, start, end, 0);
	vma_set_range(vma, start, end, pgoff);
	vma_iter_store(vmi, vma);

	vma_complete(&vp, vmi, vma->vm_mm);
	return 0;

nomem:
	if (anon_dup)
		unlink_anon_vmas(anon_dup);
	return -ENOMEM;
}

/*
 * vma_shrink() - Reduce an existing VMA's memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The new page offset of @vma
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff)
{
	struct vma_prepare vp;

	WARN_ON((vma->vm_start != start) && (vma->vm_end != end));

	if (vma->vm_start < start)
		vma_iter_config(vmi, vma->vm_start, start);
	else
		vma_iter_config(vmi, end, vma->vm_end);

	if (vma_iter_prealloc(vmi, NULL))
		return -ENOMEM;

	vma_start_write(vma);

	init_vma_prep(&vp, vma);
	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, start, end, 0);

	vma_iter_clear(vmi);
	vma_set_range(vma, start, end, pgoff);
	vma_complete(&vp, vmi, vma->vm_mm);
	return 0;
}

/*
 * vma_complete() - Helper function for handling the unlocking after altering
 * VMAs, or for inserting a VMA.
 *
 * @vp: The vma_prepare struct
 * @vmi: The vma iterator
 * @mm: The mm_struct
 */
void vma_complete(struct vma_prepare *vp,
		  struct vma_iterator *vmi, struct mm_struct *mm)
{
	if (vp->file) {
		if (vp->adj_next)
			vma_interval_tree_insert(vp->adj_next,
						 &vp->mapping->i_mmap);
		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
		flush_dcache_mmap_unlock(vp->mapping);
	}

	if (vp->remove && vp->file) {
		__remove_shared_vm_struct(vp->remove, vp->mapping);
		if (vp->remove2)
			__remove_shared_vm_struct(vp->remove2, vp->mapping);
	} else if (vp->insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		vma_iter_store(vmi, vp->insert);
		mm->map_count++;
	}

	if (vp->anon_vma) {
		anon_vma_interval_tree_post_update_vma(vp->vma);
		if (vp->adj_next)
			anon_vma_interval_tree_post_update_vma(vp->adj_next);
		anon_vma_unlock_write(vp->anon_vma);
	}

	if (vp->file) {
		i_mmap_unlock_write(vp->mapping);
		uprobe_mmap(vp->vma);

		if (vp->adj_next)
			uprobe_mmap(vp->adj_next);
	}

	if (vp->remove) {
again:
		vma_mark_detached(vp->remove, true);
		if (vp->file) {
			uprobe_munmap(vp->remove, vp->remove->vm_start,
				      vp->remove->vm_end);
			fput(vp->file);
		}
		if (vp->remove->anon_vma)
			anon_vma_merge(vp->vma, vp->remove);
		mm->map_count--;
		mpol_put(vma_policy(vp->remove));
		if (!vp->remove2)
			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
		vm_area_free(vp->remove);

		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we are removing both mid and next vmas
		 */
		if (vp->remove2) {
			vp->remove = vp->remove2;
			vp->remove2 = NULL;
			goto again;
		}
	}
	if (vp->insert && vp->file)
		uprobe_mmap(vp->insert);
	validate_mm(mm);
}

/*
 * abort_munmap_vmas - Undo any munmap work and free resources
 *
 * Reattach any detached vmas and free up the maple tree used to track the vmas.
 */
static inline void abort_munmap_vmas(struct ma_state *mas_detach)
{
	struct vm_area_struct *vma;

	mas_set(mas_detach, 0);
	mas_for_each(mas_detach, vma, ULONG_MAX)
		vma_mark_detached(vma, false);

	__mt_destroy(mas_detach->tree);
}

/*
 * vms_complete_munmap_vmas() - Finish the munmap() operation
 * @vms: The vma munmap struct
 * @mas_detach: The maple state of the detached vmas
 *
 * This updates the mm_struct, unmaps the region, frees the resources
 * used for the munmap() and may downgrade the lock - if requested. Everything
 * needed to be done once the vma maple tree is updated.
 */
static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
{
	struct vm_area_struct *prev, *next;
	struct mm_struct *mm;

	mm = vms->mm;
	mm->map_count -= vms->vma_count;
	mm->locked_vm -= vms->locked_vm;
	if (vms->unlock)
		mmap_write_downgrade(mm);

	prev = vma_iter_prev_range(vms->vmi);
	next = vma_next(vms->vmi);
	if (next)
		vma_iter_prev_range(vms->vmi);

	/*
	 * We can free page tables without write-locking mmap_lock because VMAs
	 * were isolated before we downgraded mmap_lock.
	 */
	mas_set(mas_detach, 1);
	unmap_region(mm, mas_detach, vms->vma, prev, next, vms->start, vms->end,
		     vms->vma_count, !vms->unlock);
	/* Statistics and freeing VMAs */
	mas_set(mas_detach, 0);
	remove_mt(mm, mas_detach);
	validate_mm(mm);
	if (vms->unlock)
		mmap_read_unlock(mm);

	__mt_destroy(mas_detach->tree);
}

/*
 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
 * for removal at a later date. Handles splitting first and last if necessary
 * and marking the vmas as isolated.
 *
 * @vms: The vma munmap struct
 * @mas_detach: The maple state tracking the detached tree
 *
 * Return: 0 on success, -EPERM on mseal vmas, -ENOMEM otherwise
 */
static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
		struct ma_state *mas_detach)
{
	struct vm_area_struct *next = NULL;
	int error = -ENOMEM;

	/*
	 * If we need to split any vma, do it now to save pain later.
	 *
	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
	 * unmapped vm_area_struct will remain in use: so lower split_vma
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */

	/* Does it split the first one? */
	if (vms->start > vms->vma->vm_start) {

		/*
		 * Make sure that map_count on return from munmap() will
		 * not exceed its limit; but let map_count go just above
		 * its limit temporarily, to help free resources as expected.
		 */
		if (vms->end < vms->vma->vm_end &&
		    vms->mm->map_count >= sysctl_max_map_count)
			goto map_count_exceeded;

		/* Don't bother splitting the VMA if we can't unmap it anyway */
		if (!can_modify_vma(vms->vma)) {
			error = -EPERM;
			goto start_split_failed;
		}

		if (__split_vma(vms->vmi, vms->vma, vms->start, 1))
			goto start_split_failed;
	}

	/*
	 * Detach a range of VMAs from the mm. Using next as a temp variable as
	 * it is always overwritten.
	 */
	next = vms->vma;
	do {
		if (!can_modify_vma(next)) {
			error = -EPERM;
			goto modify_vma_failed;
		}

		/* Does it split the end? */
		if (next->vm_end > vms->end) {
			if (__split_vma(vms->vmi, next, vms->end, 0))
				goto end_split_failed;
		}
		vma_start_write(next);
		mas_set(mas_detach, vms->vma_count++);
		if (mas_store_gfp(mas_detach, next, GFP_KERNEL))
			goto munmap_gather_failed;

		vma_mark_detached(next, true);
		if (next->vm_flags & VM_LOCKED)
			vms->locked_vm += vma_pages(next);

		if (unlikely(vms->uf)) {
			/*
			 * If userfaultfd_unmap_prep returns an error the vmas
			 * will remain split, but userland will get a
			 * highly unexpected error anyway. This is no
			 * different than the case where the first of the two
			 * __split_vma fails, but we don't undo the first
			 * split, even though we could. This is an unlikely
			 * enough failure that it's not worth optimizing for.
			 */
			if (userfaultfd_unmap_prep(next, vms->start, vms->end,
						   vms->uf))
				goto userfaultfd_error;
		}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
		BUG_ON(next->vm_start < vms->start);
		BUG_ON(next->vm_start > vms->end);
#endif
	} for_each_vma_range(*(vms->vmi), next, vms->end);

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
	/* Make sure no VMAs are about to be lost. */
	{
		MA_STATE(test, mas_detach->tree, 0, 0);
		struct vm_area_struct *vma_mas, *vma_test;
		int test_count = 0;

		vma_iter_set(vms->vmi, vms->start);
		rcu_read_lock();
		vma_test = mas_find(&test, vms->vma_count - 1);
		for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
			BUG_ON(vma_mas != vma_test);
			test_count++;
			vma_test = mas_next(&test, vms->vma_count - 1);
		}
		rcu_read_unlock();
		BUG_ON(vms->vma_count != test_count);
	}
#endif

	while (vma_iter_addr(vms->vmi) > vms->start)
		vma_iter_prev_range(vms->vmi);

	return 0;

userfaultfd_error:
munmap_gather_failed:
end_split_failed:
modify_vma_failed:
	abort_munmap_vmas(mas_detach);
start_split_failed:
map_count_exceeded:
	return error;
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock. Unlocking only happens on
 *          success.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct mm_struct *mm, unsigned long start, unsigned long end,
		struct list_head *uf, bool unlock)
{
	struct maple_tree mt_detach;
	MA_STATE(mas_detach, &mt_detach, 0, 0);
	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
	mt_on_stack(mt_detach);
	struct vma_munmap_struct vms;
	int error;

	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
	error = vms_gather_munmap_vmas(&vms, &mas_detach);
	if (error)
		goto gather_failed;

	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
	if (error)
		goto clear_tree_failed;

	/* Point of no return */
	vms_complete_munmap_vmas(&vms, &mas_detach);
	return 0;

clear_tree_failed:
	abort_munmap_vmas(&mas_detach);
gather_failed:
	validate_mm(mm);
	return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * This function takes @vmi, which is either pointing to the previous VMA or
 * set to MA_START, and sets it up to remove the mapping(s). The @len will be
 * aligned.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
		  unsigned long start, size_t len, struct list_head *uf,
		  bool unlock)
{
	unsigned long end;
	struct vm_area_struct *vma;

	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	end = start + PAGE_ALIGN(len);
	if (end == start)
		return -EINVAL;

	/* Find the first overlapping VMA */
	vma = vma_find(vmi, end);
	if (!vma) {
		if (unlock)
			mmap_write_unlock(mm);
		return 0;
	}

	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
 * figure out whether that can be merged with its predecessor or its
 * successor.  Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where **** is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
 * at the same address as **** and is of the same or larger span, and
 * NNNN the next vma after ****:
 *
 *     ****             ****                   ****
 *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPCCCCCC
 *    cannot merge    might become       might become
 *                    PPNNNNNNNNNN       PPPPPPPPPPCC
 *    mmap, brk or    case 4 below       case 5 below
 *    mremap move:
 *                        ****               ****
 *                    PPPP    NNNN       PPPPCCCCNNNN
 *                    might become       might become
 *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
 *                    PPPPPPPPNNNN 2 or  PPPPPPPPNNNN 7 or
 *                    PPPPNNNNNNNN 3     PPPPNNNNNNNN 8
 *
 * It is important for case 8 that the vma CCCC overlapping the
 * region **** is never going to be extended over NNNN. Instead NNNN must
 * be extended in region **** and CCCC must be removed. This way in
 * all cases where vma_merge succeeds, the moment vma_merge drops the
 * rmap_locks, the properties of the merged vma will be already
 * correct for the whole merged range. Some of those properties like
 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
 * be correct for the whole merged range immediately after the
 * rmap_locks are released. Otherwise if NNNN would be removed and
 * CCCC would be extended over the NNNN range, remove_migration_ptes
 * or other rmap walkers (if working on addresses beyond the "end"
 * parameter) may establish ptes with the wrong permissions of CCCC
 * instead of the right permissions of NNNN.
 *
 * In the code below:
 * PPPP is represented by *prev
 * CCCC is represented by *curr or not represented at all (NULL)
 * NNNN is represented by *next or not represented at all (NULL)
 * **** is not represented - it will be merged and the vma containing the
 *      area is returned, or the function will return NULL
 */
static struct vm_area_struct
*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
	   struct vm_area_struct *src, unsigned long addr, unsigned long end,
	   unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
	   struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
	   struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = src->vm_mm;
	struct anon_vma *anon_vma = src->anon_vma;
	struct file *file = src->vm_file;
	struct vm_area_struct *curr, *next, *res;
	struct vm_area_struct *vma, *adjust, *remove, *remove2;
	struct vm_area_struct *anon_dup = NULL;
	struct vma_prepare vp;
	pgoff_t vma_pgoff;
	int err = 0;
	bool merge_prev = false;
	bool merge_next = false;
	bool vma_expanded = false;
	unsigned long vma_start = addr;
	unsigned long vma_end = end;
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
	long adj_start = 0;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
	if (vm_flags & VM_SPECIAL)
		return NULL;

	/* Does the input range span an existing VMA? (cases 5 - 8) */
	curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);

	if (!curr ||			/* cases 1 - 4 */
	    end == curr->vm_end)	/* cases 6 - 8, adjacent VMA */
		next = vma_lookup(mm, end);
	else
		next = NULL;		/* case 5 */

	if (prev) {
		vma_start = prev->vm_start;
		vma_pgoff = prev->vm_pgoff;

		/* Can we merge the predecessor? */
		if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
		    && can_vma_merge_after(prev, vm_flags, anon_vma, file,
					   pgoff, vm_userfaultfd_ctx, anon_name)) {
			merge_prev = true;
			vma_prev(vmi);
		}
	}

	/* Can we merge the successor? */
	if (next && mpol_equal(policy, vma_policy(next)) &&
	    can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
				 vm_userfaultfd_ctx, anon_name)) {
		merge_next = true;
	}

	/* Verify some invariant that must be enforced by the caller. */
	VM_WARN_ON(prev && addr <= prev->vm_start);
	VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
	VM_WARN_ON(addr >= end);

	if (!merge_prev && !merge_next)
		return NULL; /* Not mergeable. */

	if (merge_prev)
		vma_start_write(prev);

	res = vma = prev;
	remove = remove2 = adjust = NULL;

	/* Can we merge both the predecessor and the successor? */
	if (merge_prev && merge_next &&
	    is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
		vma_start_write(next);
		remove = next;				/* case 1 */
		vma_end = next->vm_end;
		err = dup_anon_vma(prev, next, &anon_dup);
		if (curr) {				/* case 6 */
			vma_start_write(curr);
			remove = curr;
			remove2 = next;
			/*
			 * Note that the dup_anon_vma below cannot overwrite err
			 * since the first caller would do nothing unless next
			 * has an anon_vma.
			 */
			if (!next->anon_vma)
				err = dup_anon_vma(prev, curr, &anon_dup);
		}
	} else if (merge_prev) {			/* case 2 */
		if (curr) {
			vma_start_write(curr);
			if (end == curr->vm_end) {	/* case 7 */
				/*
				 * can_vma_merge_after() assumed we would not be
				 * removing prev vma, so it skipped the check
				 * for vm_ops->close, but we are removing curr
				 */
				if (curr->vm_ops && curr->vm_ops->close)
					err = -EINVAL;
				remove = curr;
			} else {			/* case 5 */
				adjust = curr;
				adj_start = (end - curr->vm_start);
			}
			if (!err)
				err = dup_anon_vma(prev, curr, &anon_dup);
		}
	} else { /* merge_next */
		vma_start_write(next);
		res = next;
		if (prev && addr < prev->vm_end) {	/* case 4 */
			vma_start_write(prev);
			vma_end = addr;
			adjust = next;
			adj_start = -(prev->vm_end - addr);
			err = dup_anon_vma(next, prev, &anon_dup);
		} else {
			/*
			 * Note that cases 3 and 8 are the ONLY ones where prev
			 * is permitted to be (but is not necessarily) NULL.
			 */
			vma = next;			/* case 3 */
			vma_start = addr;
			vma_end = next->vm_end;
			vma_pgoff = next->vm_pgoff - pglen;
			if (curr) {			/* case 8 */
				vma_pgoff = curr->vm_pgoff;
				vma_start_write(curr);
				remove = curr;
				err = dup_anon_vma(next, curr, &anon_dup);
			}
		}
	}

	/* Error in anon_vma clone. */
	if (err)
		goto anon_vma_fail;

	if (vma_start < vma->vm_start || vma_end > vma->vm_end)
		vma_expanded = true;

	if (vma_expanded) {
		vma_iter_config(vmi, vma_start, vma_end);
	} else {
		vma_iter_config(vmi, adjust->vm_start + adj_start,
				adjust->vm_end);
	}

	if (vma_iter_prealloc(vmi, vma))
		goto prealloc_fail;

	init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
	VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
		   vp.anon_vma != adjust->anon_vma);

	vma_prepare(&vp);
	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
	vma_set_range(vma, vma_start, vma_end, vma_pgoff);

	if (vma_expanded)
		vma_iter_store(vmi, vma);

	if (adj_start) {
		adjust->vm_start += adj_start;
		adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
		if (adj_start < 0) {
			WARN_ON(vma_expanded);
			vma_iter_store(vmi, next);
		}
	}

	vma_complete(&vp, vmi, mm);
	khugepaged_enter_vma(res, vm_flags);
	return res;

prealloc_fail:
	if (anon_dup)
		unlink_anon_vmas(anon_dup);

anon_vma_fail:
	vma_iter_set(vmi, addr);
	vma_iter_load(vmi);
	return NULL;
}

/*
 * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
 * context and anonymous VMA name within the range [start, end).
 *
 * As a result, we might be able to merge the newly modified VMA range with an
 * adjacent VMA with identical properties.
 *
 * If no merge is possible and the range does not span the entirety of the VMA,
 * we then need to split the VMA to accommodate the change.
 *
 * The function returns either the merged VMA, the original VMA if a split was
 * required instead, or an error if the split failed.
 */
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
				  struct vm_area_struct *prev,
				  struct vm_area_struct *vma,
				  unsigned long start, unsigned long end,
				  unsigned long vm_flags,
				  struct mempolicy *policy,
				  struct vm_userfaultfd_ctx uffd_ctx,
				  struct anon_vma_name *anon_name)
{
	pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	struct vm_area_struct *merged;

	merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
			   pgoff, policy, uffd_ctx, anon_name);
	if (merged)
		return merged;

	if (vma->vm_start < start) {
		int err = split_vma(vmi, vma, start, 1);

		if (err)
			return ERR_PTR(err);
	}

	if (vma->vm_end > end) {
		int err = split_vma(vmi, vma, end, 0);

		if (err)
			return ERR_PTR(err);
	}

	return vma;
}

/*
 * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
 * must ensure that [start, end) does not overlap any existing VMA.
 */
struct vm_area_struct
*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
		   struct vm_area_struct *vma, unsigned long start,
		   unsigned long end, pgoff_t pgoff)
{
	return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
			 vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}

/*
 * Expand vma by delta bytes, potentially merging with an immediately adjacent
 * VMA with identical properties.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
					struct vm_area_struct *vma,
					unsigned long delta)
{
	pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);

	/* vma is specified as prev, so case 1 or 2 will apply. */
	return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
			 vma->vm_flags, pgoff, vma_policy(vma),
			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
{
	vb->count = 0;
}

static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
{
	struct address_space *mapping;
	int i;

	mapping = vb->vmas[0]->vm_file->f_mapping;
	i_mmap_lock_write(mapping);
	for (i = 0; i < vb->count; i++) {
		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
		__remove_shared_vm_struct(vb->vmas[i], mapping);
	}
	i_mmap_unlock_write(mapping);

	unlink_file_vma_batch_init(vb);
}

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
			       struct vm_area_struct *vma)
{
	if (vma->vm_file == NULL)
		return;

	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
	    vb->count == ARRAY_SIZE(vb->vmas))
		unlink_file_vma_batch_process(vb);

	vb->vmas[vb->count] = vma;
	vb->count++;
}

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
{
	if (vb->count > 0)
		unlink_file_vma_batch_process(vb);
}
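
/*
 * Illustrative sketch (not part of the original file): the intended calling
 * pattern for the batched unlink helpers above when tearing down many VMAs.
 * The helper name and the maple state source are hypothetical; real users
 * walk the VMA tree during page table teardown. Guarded out so it is never
 * built.
 */
#if 0
static void example_unlink_file_vmas(struct ma_state *mas)
{
	struct unlink_vma_file_batch vb;
	struct vm_area_struct *vma;

	unlink_file_vma_batch_init(&vb);
	mas_for_each(mas, vma, ULONG_MAX)
		unlink_file_vma_batch_add(&vb, vma);	/* anonymous VMAs are skipped */
	unlink_file_vma_batch_final(&vb);		/* flush any batched remainder */
}
#endif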

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;

	if (file) {
		struct address_space *mapping = file->f_mapping;

		i_mmap_lock_write(mapping);
		__remove_shared_vm_struct(vma, mapping);
		i_mmap_unlock_write(mapping);
	}
}

void vma_link_file(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping;

	if (file) {
		mapping = file->f_mapping;
		i_mmap_lock_write(mapping);
		__vma_link_file(vma, mapping);
		i_mmap_unlock_write(mapping);
	}
}

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
	VMA_ITERATOR(vmi, mm, 0);

	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
	if (vma_iter_prealloc(&vmi, vma))
		return -ENOMEM;

	vma_start_write(vma);
	vma_iter_store(&vmi, vma);
	vma_link_file(vma);
	mm->map_count++;
	validate_mm(mm);
	return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff,
	bool *need_rmap_locks)
{
	struct vm_area_struct *vma = *vmap;
	unsigned long vma_start = vma->vm_start;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma, *prev;
	bool faulted_in_anon_vma = true;
	VMA_ITERATOR(vmi, mm, addr);

	/*
	 * If anonymous vma has not yet been faulted, update new pgoff
	 * to match new location, to increase its chance of merging.
	 */
	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
		pgoff = addr >> PAGE_SHIFT;
		faulted_in_anon_vma = false;
	}

	new_vma = find_vma_prev(mm, addr, &prev);
	if (new_vma && new_vma->vm_start < addr + len)
		return NULL;	/* should never get here */

	new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
	if (new_vma) {
		/*
		 * Source vma may have been merged into new_vma
		 */
		if (unlikely(vma_start >= new_vma->vm_start &&
			     vma_start < new_vma->vm_end)) {
			/*
			 * The only way we can get a vma_merge with
			 * self during an mremap is if the vma hasn't
			 * been faulted in yet and we were allowed to
			 * reset the dst vma->vm_pgoff to the
			 * destination address of the mremap to allow
			 * the merge to happen. mremap must change the
			 * vm_pgoff linearity between src and dst vmas
			 * (in turn preventing a vma_merge) to be
			 * safe. It is only safe to keep the vm_pgoff
			 * linear if there are no pages mapped yet.
			 */
			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
			*vmap = vma = new_vma;
		}
		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
	} else {
		new_vma = vm_area_dup(vma);
		if (!new_vma)
			goto out;
		vma_set_range(new_vma, addr, addr + len, pgoff);
		if (vma_dup_policy(vma, new_vma))
			goto out_free_vma;
		if (anon_vma_clone(new_vma, vma))
			goto out_free_mempol;
		if (new_vma->vm_file)
			get_file(new_vma->vm_file);
		if (new_vma->vm_ops && new_vma->vm_ops->open)
			new_vma->vm_ops->open(new_vma);
		if (vma_link(mm, new_vma))
			goto out_vma_link;
		*need_rmap_locks = false;
	}
	return new_vma;

out_vma_link:
	if (new_vma->vm_ops && new_vma->vm_ops->close)
		new_vma->vm_ops->close(new_vma);

	if (new_vma->vm_file)
		fput(new_vma->vm_file);

	unlink_anon_vmas(new_vma);
out_free_mempol:
	mpol_put(vma_policy(new_vma));
out_free_vma:
	vm_area_free(new_vma);
out:
	return NULL;
}

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
	return a->vm_end == b->vm_start &&
		mpol_equal(vma_policy(a), vma_policy(b)) &&
		a->vm_file == b->vm_file &&
		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
					  struct vm_area_struct *a,
					  struct vm_area_struct *b)
{
	if (anon_vma_compatible(a, b)) {
		struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);

		if (anon_vma && list_is_singular(&old->anon_vma_chain))
			return anon_vma;
	}
	return NULL;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = NULL;
	struct vm_area_struct *prev, *next;
	VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

	/* Try next first. */
	next = vma_iter_load(&vmi);
	if (next) {
		anon_vma = reusable_anon_vma(next, vma, next);
		if (anon_vma)
			return anon_vma;
	}

	prev = vma_prev(&vmi);
	VM_BUG_ON_VMA(prev != vma, vma);
	prev = vma_prev(&vmi);
	/* Try prev next. */
	if (prev)
		anon_vma = reusable_anon_vma(prev, prev, vma);

	/*
	 * We might reach here with anon_vma == NULL if we can't find
	 * any reusable anon_vma.
	 * There's no absolute need to look only at touching neighbours:
	 * we could search further afield for "compatible" anon_vmas.
	 * But it would probably just be a waste of time searching,
	 * or lead to too many vmas hanging off the same anon_vma.
	 * We're trying to allow mprotect remerging later on,
	 * not trying to minimize memory used for anon_vmas.
	 */
	return anon_vma;
}

static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
	return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
	return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
		(VM_WRITE | VM_SHARED);
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
	/* No managed pages to writeback. */
	if (vma->vm_flags & VM_PFNMAP)
		return false;

	return vma->vm_file && vma->vm_file->f_mapping &&
		mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
	/* Only shared, writable VMAs require dirty tracking. */
	if (!vma_is_shared_writable(vma))
		return false;

	/* Does the filesystem need to be notified? */
	if (vm_ops_needs_writenotify(vma->vm_ops))
		return true;

	/*
	 * Even if the filesystem doesn't indicate a need for writenotify, if it
	 * can writeback, dirty tracking is still required.
	 */
	return vma_fs_can_writeback(vma);
}

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
	/* If it was private or non-writable, the write bit is already clear */
	if (!vma_is_shared_writable(vma))
		return false;

	/* The backer wishes to know when pages are first written to? */
	if (vm_ops_needs_writenotify(vma->vm_ops))
		return true;

	/* The open routine did something to the protections that pgprot_modify
	 * won't preserve? */
	if (pgprot_val(vm_page_prot) !=
	    pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
		return false;

	/*
	 * Do we need to track softdirty? hugetlb does not support softdirty
	 * tracking yet.
	 */
	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
		return true;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_wp(vma))
		return true;

	/* Can the mapping track the dirty pages? */
	return vma_fs_can_writeback(vma);
}

unsigned long count_vma_pages_range(struct mm_struct *mm,
				    unsigned long addr, unsigned long end)
{
	VMA_ITERATOR(vmi, mm, addr);
	struct vm_area_struct *vma;
	unsigned long nr_pages = 0;

	for_each_vma_range(vmi, vma, end) {
		unsigned long vm_start = max(addr, vma->vm_start);
		unsigned long vm_end = min(end, vma->vm_end);

		nr_pages += PHYS_PFN(vm_end - vm_start);
	}

	return nr_pages;
}

static DEFINE_MUTEX(mm_all_locks_mutex);

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
		/*
		 * The LSB of head.next can't change from under us
		 * because we hold the mm_all_locks_mutex.
		 */
		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
		/*
		 * We can safely modify head.next after taking the
		 * anon_vma->root->rwsem. If some other vma in this mm shares
		 * the same anon_vma we won't take it again.
		 *
		 * No need of atomic instructions here, head.next
		 * can't change from under us thanks to the
		 * anon_vma->root->rwsem.
		 */
		if (__test_and_set_bit(0, (unsigned long *)
				       &anon_vma->root->rb_root.rb_root.rb_node))
			BUG();
	}
}

static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change from under us because
		 * we hold the mm_all_locks_mutex.
		 *
		 * Operations on ->flags have to be atomic because
		 * even if AS_MM_ALL_LOCKS is stable thanks to the
		 * mm_all_locks_mutex, there may be other cpus
		 * changing other bitflags in parallel to us.
		 */
		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
			BUG();
		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
	}
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas being associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid taking the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *   hugetlb mapping);
 * - all vmas marked locked
 * - all i_mmap_rwsem locks;
 * - all anon_vma->rwsem
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we are protected from parallel mm_take_all_locks() by
 * mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	struct anon_vma_chain *avc;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_assert_write_locked(mm);

	mutex_lock(&mm_all_locks_mutex);

	/*
	 * vma_start_write() does not have a complement in mm_drop_all_locks()
	 * because vma_start_write() is always asymmetrical; it marks a VMA as
	 * being written to until mmap_write_unlock() or mmap_write_downgrade()
	 * is reached.
	 */
	for_each_vma(vmi, vma) {
		if (signal_pending(current))
			goto out_unlock;
		vma_start_write(vma);
	}

	vma_iter_init(&vmi, mm, 0);
	for_each_vma(vmi, vma) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->vm_file && vma->vm_file->f_mapping &&
		    is_vm_hugetlb_page(vma))
			vm_lock_mapping(mm, vma->vm_file->f_mapping);
	}

	vma_iter_init(&vmi, mm, 0);
	for_each_vma(vmi, vma) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->vm_file && vma->vm_file->f_mapping &&
		    !is_vm_hugetlb_page(vma))
			vm_lock_mapping(mm, vma->vm_file->f_mapping);
	}

	vma_iter_init(&vmi, mm, 0);
	for_each_vma(vmi, vma) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->anon_vma)
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				vm_lock_anon_vma(mm, avc->anon_vma);
	}

	return 0;

out_unlock:
	mm_drop_all_locks(mm);
	return -EINTR;
}

static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
		/*
		 * The LSB of head.next can't change to 0 from under
		 * us because we hold the mm_all_locks_mutex.
		 *
		 * We must however clear the bitflag before unlocking
		 * the vma so the users using the anon_vma->rb_root will
		 * never see our bitflag.
		 *
		 * No need of atomic instructions here, head.next
		 * can't change from under us until we release the
		 * anon_vma->root->rwsem.
		 */
		if (!__test_and_clear_bit(0, (unsigned long *)
					  &anon_vma->root->rb_root.rb_root.rb_node))
			BUG();
		anon_vma_unlock_write(anon_vma);
	}
}

static void vm_unlock_mapping(struct address_space *mapping)
{
	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change to 0 from under us
		 * because we hold the mm_all_locks_mutex.
		 */
		i_mmap_unlock_write(mapping);
		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
					&mapping->flags))
			BUG();
	}
}

/*
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	struct anon_vma_chain *avc;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_assert_write_locked(mm);
	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

	for_each_vma(vmi, vma) {
		if (vma->anon_vma)
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				vm_unlock_anon_vma(avc->anon_vma);
		if (vma->vm_file && vma->vm_file->f_mapping)
			vm_unlock_mapping(vma->vm_file->f_mapping);
	}

	mutex_unlock(&mm_all_locks_mutex);
}
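
/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to drive do_vmi_munmap() defined above. The helper name is
 * hypothetical; the real callers live in mm/mmap.c. Guarded out so it is
 * never built.
 */
#if 0
static int example_munmap(struct mm_struct *mm, unsigned long start,
			  size_t len, struct list_head *uf)
{
	VMA_ITERATOR(vmi, mm, start);
	int ret;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/* unlock == true: mmap_lock is dropped by do_vmi_munmap() on success. */
	ret = do_vmi_munmap(&vmi, mm, start, len, uf, true);
	if (ret)
		mmap_write_unlock(mm);	/* the lock is kept on failure */

	return ret;
}
#endif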