// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"

/* Classify the kind of remap operation being performed. */
enum mremap_type {
	MREMAP_INVALID,		/* Initial state. */
	MREMAP_NO_RESIZE,	/* old_len == new_len, if not moved, do nothing. */
	MREMAP_SHRINK,		/* old_len > new_len. */
	MREMAP_EXPAND,		/* old_len < new_len. */
};

/*
 * Describes a VMA mremap() operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the operation, however these values will
 * always accurately reflect the remap (for instance, we may adjust lengths and
 * delta to account for hugetlb alignment).
 */
struct vma_remap_struct {
	/* User-provided state. */
	unsigned long addr;	/* User-specified address from which we remap. */
	unsigned long old_len;	/* Length of range being remapped. */
	unsigned long new_len;	/* Desired new length of mapping. */
	unsigned long flags;	/* user-specified MREMAP_* flags. */
	unsigned long new_addr;	/* Optionally, desired new address. */

	/* uffd state. */
	struct vm_userfaultfd_ctx *uf;
	struct list_head *uf_unmap_early;
	struct list_head *uf_unmap;

	/* VMA state, determined in do_mremap(). */
	struct vm_area_struct *vma;

	/* Internal state, determined in do_mremap(). */
	unsigned long delta;		/* Absolute delta of old_len, new_len. */
	bool mlocked;			/* Was the VMA mlock()'d? */
	enum mremap_type remap_type;	/* expand, shrink, etc. */
	bool mmap_locked;		/* Is mm currently write-locked? */
	unsigned long charged;		/* If VM_ACCOUNT, # pages to account. */
};
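
/*
 * Illustrative only: a userspace call such as
 *
 *	mremap(addr, old_len, new_len, MREMAP_MAYMOVE | MREMAP_FIXED, new_addr);
 *
 * arrives via the mremap syscall at the bottom of this file, which packs the
 * (untagged) arguments into a struct vma_remap_struct and hands it to
 * do_mremap(). The remaining fields are derived state filled in as the
 * operation proceeds.
 */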

static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}
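
/*
 * Move the PTEs covering [pmc->old_addr, pmc->old_addr + extent) from the page
 * table at old_pmd to the one at new_pmd, taking the rmap locks if the caller
 * requires it. Returns 0 on success, or -EAGAIN if the source or destination
 * PTE table could not be mapped (e.g. it changed under us), in which case the
 * caller retries.
 */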
static int move_ptes(struct pagetable_move_control *pmc,
		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
	struct vm_area_struct *vma = pmc->old;
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	pmd_t dummy_pmdval;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long old_addr = pmc->old_addr;
	unsigned long new_addr = pmc->new_addr;
	unsigned long old_end = old_addr + extent;
	unsigned long len = old_end - old_addr;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (pmc->need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_pte) {
		err = -EAGAIN;
		goto out;
	}
	/*
	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
	 * this by traversing file->f_mapping, so there is no concurrency with
	 * retract_page_tables(). In addition, we already hold the exclusive
	 * mmap_lock, so this new_pte page is stable, so there is no need to get
	 * pmdval and do pmd_same() check.
	 */
	new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
					   &new_ptl);
	if (!new_pte) {
		pte_unmap_unlock(old_pte, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		VM_WARN_ON_ONCE(!pte_none(*new_pte));

		if (pte_none(ptep_get(old_pte)))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with folio_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);

		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
			pte_clear(mm, new_addr, new_pte);
		else {
			if (need_clear_uffd_wp) {
				if (pte_present(pte))
					pte = pte_clear_uffd_wp(pte);
				else if (is_swap_pte(pte))
					pte = pte_swp_clear_uffd_wp(pte);
			}
			set_pte_at(mm, new_addr, new_pte, pte);
		}
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
out:
	if (pmc->need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif
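
/*
 * The helpers below move whole page tables rather than individual entries:
 * when both source and destination are suitably aligned and the destination
 * table is empty, a single PMD/PUD entry can simply be repointed at the
 * existing lower-level table, avoiding a PTE-by-PTE copy.
 */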
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct pagetable_move_control *pmc,
			pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	bool res = false;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/*
	 * If this pmd belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	pmd = *old_pmd;

	/* Racing with collapse? */
	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
		goto out_unlock;
	/* Clear the pmd */
	pmd_clear(old_pmd);
	res = true;

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return res;
}
#else
static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
static bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * If this pud belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	return false;
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static bool move_huge_pud(struct pagetable_move_control *pmc,
			pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* TODO: mark soft dirty when we add pud-level soft dirty support */
	set_pud_at(mm, pmc->new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct pagetable_move_control *pmc,
			pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;
}
#endif

enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};

/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
 */
static __always_inline unsigned long get_extent(enum pgt_entry entry,
						struct pagetable_move_control *pmc)
{
	unsigned long next, extent, mask, size;
	unsigned long old_addr = pmc->old_addr;
	unsigned long old_end = pmc->old_end;
	unsigned long new_addr = pmc->new_addr;

	switch (entry) {
	case HPAGE_PMD:
	case NORMAL_PMD:
		mask = PMD_MASK;
		size = PMD_SIZE;
		break;
	case HPAGE_PUD:
	case NORMAL_PUD:
		mask = PUD_MASK;
		size = PUD_SIZE;
		break;
	default:
		BUILD_BUG();
		break;
	}

	next = (old_addr + size) & mask;
	/* even if next overflowed, extent below will be ok */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + size) & mask;
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}
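
/*
 * Worked example (illustrative, assuming 4KiB pages and 2MiB PMDs): with
 * old_addr = 0x2ff000, old_end = 0x600000 and new_addr = 0x5ff000,
 * get_extent(NORMAL_PMD, pmc) first limits the extent to the next source PMD
 * boundary (0x400000 - 0x2ff000 = 0x101000), then to the next destination
 * boundary (0x600000 - 0x5ff000 = 0x1000), so only a single page is stepped
 * before both sides become PMD-aligned.
 */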
/*
 * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
 * the PMC, or overridden in the case of normal, larger page tables.
 */
static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
				   enum pgt_entry entry)
{
	switch (entry) {
	case NORMAL_PMD:
	case NORMAL_PUD:
		return true;
	default:
		return pmc->need_rmap_locks;
	}
}

/*
 * Attempts to speedup the move by moving entry at the level corresponding to
 * pgt_entry. Returns true if the move was successful, else false.
 */
static bool move_pgt_entry(struct pagetable_move_control *pmc,
			   enum pgt_entry entry, void *old_entry, void *new_entry)
{
	bool moved = false;
	bool need_rmap_locks = should_take_rmap_locks(pmc, entry);

	/* See comment in move_ptes() */
	if (need_rmap_locks)
		take_rmap_locks(pmc->old);

	switch (entry) {
	case NORMAL_PMD:
		moved = move_normal_pmd(pmc, old_entry, new_entry);
		break;
	case NORMAL_PUD:
		moved = move_normal_pud(pmc, old_entry, new_entry);
		break;
	case HPAGE_PMD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
				      new_entry);
		break;
	case HPAGE_PUD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pud(pmc, old_entry, new_entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (need_rmap_locks)
		drop_rmap_locks(pmc->old);

	return moved;
}

/*
 * A helper to check if aligning down is OK. The aligned address should fall
 * on *no mapping*. For the stack moving down, that's a special move within
 * the VMA that is created to span the source and destination of the move,
 * so we make an exception for it.
 */
static bool can_align_down(struct pagetable_move_control *pmc,
			   struct vm_area_struct *vma, unsigned long addr_to_align,
			   unsigned long mask)
{
	unsigned long addr_masked = addr_to_align & mask;

	/*
	 * If @addr_to_align of either source or destination is not the beginning
	 * of the corresponding VMA, we can't align down or we will destroy part
	 * of the current mapping.
	 */
	if (!pmc->for_stack && vma->vm_start != addr_to_align)
		return false;

	/* In the stack case we explicitly permit in-VMA alignment. */
	if (pmc->for_stack && addr_masked >= vma->vm_start)
		return true;

	/*
	 * Make sure the realignment doesn't cause the address to fall on an
	 * existing mapping.
	 */
	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}

/*
 * Determine if we are in fact able to realign for efficiency to a higher page
 * table boundary.
 */
static bool can_realign_addr(struct pagetable_move_control *pmc,
			     unsigned long pagetable_mask)
{
	unsigned long align_mask = ~pagetable_mask;
	unsigned long old_align = pmc->old_addr & align_mask;
	unsigned long new_align = pmc->new_addr & align_mask;
	unsigned long pagetable_size = align_mask + 1;
	unsigned long old_align_next = pagetable_size - old_align;

	/*
	 * We don't want to have to go hunting for VMAs from the end of the old
	 * VMA to the next page table boundary, also we want to make sure the
	 * operation is worthwhile.
	 *
	 * So ensure that we only perform this realignment if the end of the
	 * range being copied reaches or crosses the page table boundary.
	 *
	 * boundary                        boundary
	 *    .<- old_align ->             .
	 *    .  |----------------.-----------|
	 *    .  |          vma   .           |
	 *    .  |----------------.-----------|
	 *    .  <----------------.----------->
	 *    .                len_in
	 *    <------------------------------->
	 *    .        pagetable_size         .
	 *    .  <---------------->           .
	 *    .     old_align_next            .
	 */
	if (pmc->len_in < old_align_next)
		return false;

	/* Skip if the addresses are already aligned. */
	if (old_align == 0)
		return false;

	/* Only realign if the new and old addresses are mutually aligned. */
	if (old_align != new_align)
		return false;

	/* Ensure realignment doesn't cause overlap with existing mappings. */
	if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
	    !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
		return false;

	return true;
}

/*
 * Opportunistically realign to specified boundary for faster copy.
 *
 * Consider an mremap() of a VMA with page table boundaries as below, and no
 * preceding VMAs from the lower page table boundary to the start of the VMA,
 * with the end of the range reaching or crossing the page table boundary.
 *
 *   boundary                        boundary
 *      .  |----------------.-----------|
 *      .  |          vma   .           |
 *      .  |----------------.-----------|
 *      . pmc->old_addr     .        pmc->old_end
 *      .  <---------------------------->
 *      .     move these page tables
 *
 * If we proceed with moving page tables in this scenario, we will have a lot of
 * work to do traversing old page tables and establishing new ones in the
 * destination across multiple lower level page tables.
 *
 * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
 * page table boundary, so we can simply copy a single page table entry for the
 * aligned portion of the VMA instead:
 *
 *   boundary                        boundary
 *      .  |----------------.-----------|
 *      .  |          vma   .           |
 *      .  |----------------.-----------|
 * pmc->old_addr            .        pmc->old_end
 *      <------------------------------------->
 *      .        move these page tables
 */
static void try_realign_addr(struct pagetable_move_control *pmc,
			     unsigned long pagetable_mask)
{
	if (!can_realign_addr(pmc, pagetable_mask))
		return;

	/*
	 * Simply align to page table boundaries. Note that we do NOT update the
	 * pmc->old_end value, and since the move_page_tables() operation spans
	 * from [old_addr, old_end) (offsetting new_addr as it is performed),
	 * this simply changes the start of the copy, not the end.
	 */
	pmc->old_addr &= pagetable_mask;
	pmc->new_addr &= pagetable_mask;
}

/* Is the page table move operation done? */
static bool pmc_done(struct pagetable_move_control *pmc)
{
	return pmc->old_addr >= pmc->old_end;
}

/* Advance to the next page table, offset by extent bytes. */
static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
{
	pmc->old_addr += extent;
	pmc->new_addr += extent;
}

/*
 * Determine how many bytes in the specified input range have had their page
 * tables moved so far.
 */
static unsigned long pmc_progress(struct pagetable_move_control *pmc)
{
	unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
	unsigned long old_addr = pmc->old_addr;

	/*
	 * Prevent negative return values when {old,new}_addr was realigned but
	 * we broke out of the loop in move_page_tables() for the first PMD
	 * itself.
	 */
	return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
}
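
/*
 * Move the page table entries backing the range described by @pmc from the old
 * VMA to the new one, preferring whole-PUD and whole-PMD moves where the
 * architecture and alignment allow it, and falling back to move_ptes()
 * otherwise. Returns the number of bytes of the input range whose page tables
 * were moved, which may be less than pmc->len_in on allocation failure.
 */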
unsigned long move_page_tables(struct pagetable_move_control *pmc)
{
	unsigned long extent;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;
	struct mm_struct *mm = pmc->old->vm_mm;

	if (!pmc->len_in)
		return 0;

	if (is_vm_hugetlb_page(pmc->old))
		return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
						pmc->new_addr, pmc->len_in);

	/*
	 * If possible, realign addresses to PMD boundary for faster copy.
	 * Only realign if the mremap copying hits a PMD boundary.
	 */
	try_realign_addr(pmc, PMD_MASK);

	flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
				pmc->old_addr, pmc->old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, pmc);

		old_pud = get_old_pud(mm, pmc->old_addr);
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(mm, pmc->new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
			if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
				continue;
		}

		extent = get_extent(NORMAL_PMD, pmc);
		old_pmd = get_old_pmd(mm, pmc->old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(mm, pmc->new_addr);
		if (!new_pmd)
			break;
again:
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
				continue;
			split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
				continue;
		}
		if (pmd_none(*old_pmd))
			continue;
		if (pte_alloc(pmc->new->vm_mm, new_pmd))
			break;
		if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
			goto again;
	}

	mmu_notifier_invalidate_range_end(&range);

	return pmc_progress(pmc);
}

/* Set vrm->delta to the difference in VMA size specified by user. */
static void vrm_set_delta(struct vma_remap_struct *vrm)
{
	vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
}

/* Determine what kind of remap this is - shrink, expand or no resize at all. */
static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
{
	if (vrm->delta == 0)
		return MREMAP_NO_RESIZE;

	if (vrm->old_len > vrm->new_len)
		return MREMAP_SHRINK;

	return MREMAP_EXPAND;
}

/*
 * When moving a VMA to vrm->new_addr, does this result in the new and old VMAs
 * overlapping?
 */
static bool vrm_overlaps(struct vma_remap_struct *vrm)
{
	unsigned long start_old = vrm->addr;
	unsigned long start_new = vrm->new_addr;
	unsigned long end_old = vrm->addr + vrm->old_len;
	unsigned long end_new = vrm->new_addr + vrm->new_len;

	/*
	 * start_old    end_old
	 *     |-----------|
	 *     |           |
	 *     |-----------|
	 *             |-------------|
	 *             |             |
	 *             |-------------|
	 *         start_new      end_new
	 */
	if (end_old > start_new && end_new > start_old)
		return true;

	return false;
}

/* Do the mremap() flags require that the new_addr parameter be specified? */
static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
	return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
}

/*
 * Find an unmapped area for the requested vrm->new_addr.
 *
 * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
 * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
 * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
 *
 * Returns 0 on success (with vrm->new_addr updated), or an error code upon
 * failure.
 */
static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
{
	struct vm_area_struct *vma = vrm->vma;
	unsigned long map_flags = 0;
	/* Page Offset _into_ the VMA. */
	pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
	unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
	unsigned long res;

	if (vrm->flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
				map_flags);
	if (IS_ERR_VALUE(res))
		return res;

	vrm->new_addr = res;
	return 0;
}

/*
 * Keep track of pages which have been added to the memory mapping. If the VMA
 * is accounted, also check to see if there is sufficient memory.
 *
 * Returns true on success, false if insufficient memory to charge.
 */
static bool vrm_charge(struct vma_remap_struct *vrm)
{
	unsigned long charged;

	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return true;

	/*
	 * If we don't unmap the old mapping, then we account the entirety of
	 * the length of the new one. Otherwise it's just the delta in size.
	 */
	if (vrm->flags & MREMAP_DONTUNMAP)
		charged = vrm->new_len >> PAGE_SHIFT;
	else
		charged = vrm->delta >> PAGE_SHIFT;

	/* This accounts 'charged' pages of memory. */
	if (security_vm_enough_memory_mm(current->mm, charged))
		return false;

	vrm->charged = charged;
	return true;
}

/*
 * An error has occurred so we will not be using vrm->charged memory. Unaccount
 * this memory if the VMA is accounted.
 */
static void vrm_uncharge(struct vma_remap_struct *vrm)
{
	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return;

	vm_unacct_memory(vrm->charged);
	vrm->charged = 0;
}
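
/*
 * Illustrative example: growing a 4MiB VM_ACCOUNT mapping to 6MiB without
 * MREMAP_DONTUNMAP charges only the delta, (6MiB - 4MiB) >> PAGE_SHIFT = 512
 * pages with 4KiB pages; with MREMAP_DONTUNMAP the full new length is charged
 * instead, since the old mapping remains in place.
 */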
/*
 * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
 * account for 'bytes' memory used, and if locked, indicate this in the VRM so
 * we can handle this correctly later.
 */
static void vrm_stat_account(struct vma_remap_struct *vrm,
			     unsigned long bytes)
{
	unsigned long pages = bytes >> PAGE_SHIFT;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;

	vm_stat_account(mm, vma->vm_flags, pages);
	if (vma->vm_flags & VM_LOCKED) {
		mm->locked_vm += pages;
		vrm->mlocked = true;
	}
}

/*
 * Perform checks before attempting to write a VMA prior to it being
 * moved.
 */
static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
{
	unsigned long err = 0;
	struct vm_area_struct *vma = vrm->vma;
	unsigned long old_addr = vrm->addr;
	unsigned long old_len = vrm->old_len;
	unsigned long dummy = vma->vm_flags;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (current->mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped. But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
			  MADV_UNMERGEABLE, &dummy);
	if (err)
		return err;

	return 0;
}

/*
 * Unmap source VMA for VMA move, turning it from a copy to a move, being
 * careful to ensure we do not underflow memory account while doing so if an
 * accountable move.
 *
 * This is best effort, if we fail to unmap then we simply try to correct
 * accounting and exit.
 */
static void unmap_source_vma(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr = vrm->addr;
	unsigned long len = vrm->old_len;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, addr);
	int err;
	unsigned long vm_start;
	unsigned long vm_end;
	/*
	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
	 * function implies that we unmap the original VMA, which seems
	 * contradictory.
	 *
	 * However, this occurs when this operation was attempted and an error
	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
	 * we actually _do_ want it to be unaccounted.
	 */
	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
		!(vrm->flags & MREMAP_DONTUNMAP);

	/*
	 * So we perform a trick here to prevent incorrect accounting. Any merge
	 * or new VMA allocation performed in copy_vma() does not adjust
	 * accounting, it is expected that callers handle this.
	 *
	 * And indeed we already have, accounting appropriately in the case of
	 * both in vrm_charge().
	 *
	 * However, when we unmap the existing VMA (to effect the move), this
	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
	 * removed pages.
	 *
	 * To avoid this we temporarily clear this flag, reinstating on any
	 * portions of the original VMA that remain.
	 */
	if (accountable_move) {
		vm_flags_clear(vma, VM_ACCOUNT);
		/* We are about to split vma, so store the start/end. */
		vm_start = vma->vm_start;
		vm_end = vma->vm_end;
	}

	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
	vrm->vma = NULL; /* Invalidated. */
	if (err) {
		/* OOM: unable to split vma, just get accounts right */
		vm_acct_memory(len >> PAGE_SHIFT);
		return;
	}

	/*
	 * If we mremap() from a VMA like this:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |-------------|
	 * |             |
	 * |-------------|
	 *
	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
	 * we'll end up with:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |---|     |---|
	 * | A |     | B |
	 * |---|     |---|
	 *
	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
	 * a subsequent or lone vma_next() will give us B.
	 *
	 * do_vmi_munmap() will have restored the VMI back to addr.
	 */
	if (accountable_move) {
		unsigned long end = addr + len;

		if (vm_start < addr) {
			struct vm_area_struct *prev = vma_prev(&vmi);

			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
		}

		if (vm_end > end) {
			struct vm_area_struct *next = vma_next(&vmi);

			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
		}
	}
}

/*
 * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
 * process. Additionally handle an error occurring on moving of page tables,
 * where we reset vrm state to cause unmapping of the new VMA.
 *
 * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
 * error code.
 */
static int copy_vma_and_data(struct vma_remap_struct *vrm,
			     struct vm_area_struct **new_vma_ptr)
{
	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
	unsigned long moved_len;
	struct vm_area_struct *vma = vrm->vma;
	struct vm_area_struct *new_vma;
	int err = 0;
	PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);

	new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
			   &pmc.need_rmap_locks);
	if (!new_vma) {
		vrm_uncharge(vrm);
		*new_vma_ptr = NULL;
		return -ENOMEM;
	}
	vrm->vma = vma;
	pmc.old = vma;
	pmc.new = new_vma;

	moved_len = move_page_tables(&pmc);
	if (moved_len < vrm->old_len)
		err = -ENOMEM;
	else if (vma->vm_ops && vma->vm_ops->mremap)
		err = vma->vm_ops->mremap(new_vma);

	if (unlikely(err)) {
		PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
			       vrm->addr, moved_len);

		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		pmc_revert.need_rmap_locks = true;
		move_page_tables(&pmc_revert);

		vrm->vma = new_vma;
		vrm->old_len = vrm->new_len;
		vrm->addr = vrm->new_addr;
	} else {
		mremap_userfaultfd_prep(new_vma, vrm->uf);
	}

	fixup_hugetlb_reservations(vma);

	*new_vma_ptr = new_vma;
	return err;
}

/*
 * Perform final tasks for MREMAP_DONTUNMAP operation, clearing mlock() and
 * account flags on remaining VMA by convention (it cannot be mlock()'d any
 * longer, as pages in range are no longer mapped), and removing anon_vma_chain
 * links from it (if the entire VMA was copied over).
 */
static void dontunmap_complete(struct vma_remap_struct *vrm,
			       struct vm_area_struct *new_vma)
{
	unsigned long start = vrm->addr;
	unsigned long end = vrm->addr + vrm->old_len;
	unsigned long old_start = vrm->vma->vm_start;
	unsigned long old_end = vrm->vma->vm_end;

	/*
	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
	 * vma.
	 */
	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);

	/*
	 * anon_vma links of the old vma are no longer needed after its page
	 * table has been moved.
	 */
	if (new_vma != vrm->vma && start == old_start && end == old_end)
		unlink_anon_vmas(vrm->vma);

	/* Because we won't unmap we don't need to touch locked_vm. */
}
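
/*
 * Perform the actual move of vrm->vma to its new location: validate the move,
 * charge accountable memory, copy the VMA and its page tables, then either
 * unmap the source or (for MREMAP_DONTUNMAP) strip its locked/accounted state.
 * Returns the new address on success or a negative error code.
 */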
static unsigned long move_vma(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *new_vma;
	unsigned long hiwater_vm;
	int err;

	err = prep_move_vma(vrm);
	if (err)
		return err;

	/* If accounted, charge the number of bytes the operation will use. */
	if (!vrm_charge(vrm))
		return -ENOMEM;

	/* We don't want racing faults. */
	vma_start_write(vrm->vma);

	/* Perform copy step. */
	err = copy_vma_and_data(vrm, &new_vma);
	/*
	 * If we established the copied-to VMA, we attempt to recover from the
	 * error by setting the destination VMA to the source VMA and unmapping
	 * it below.
	 */
	if (err && !new_vma)
		return err;

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;

	vrm_stat_account(vrm, vrm->new_len);
	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
		dontunmap_complete(vrm, new_vma);
	else
		unmap_source_vma(vrm);

	mm->hiwater_vm = hiwater_vm;

	return err ? (unsigned long)err : vrm->new_addr;
}

/*
 * resize_is_valid() - Ensure the vma can be resized to the new length at the
 * given address.
 *
 * Return 0 on success, error otherwise.
 */
static int resize_is_valid(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;
	unsigned long addr = vrm->addr;
	unsigned long old_len = vrm->old_len;
	unsigned long new_len = vrm->new_len;
	unsigned long pgoff;

	/*
	 * !old_len is a special case where an attempt is made to 'duplicate'
	 * a mapping. This makes no sense for private mappings as it will
	 * instead create a fresh/new mapping unrelated to the original. This
	 * is contrary to the basic idea of mremap which creates new mappings
	 * based on the original. There are no known use cases for this
	 * behavior. As a result, fail such attempts.
	 */
	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
			     current->comm, current->pid);
		return -EINVAL;
	}

	if ((vrm->flags & MREMAP_DONTUNMAP) &&
	    (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
		return -EINVAL;

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return -EFAULT;

	if (new_len == old_len)
		return 0;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return -EINVAL;

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return -EFAULT;

	if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
		return -EAGAIN;

	if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
		return -ENOMEM;

	return 0;
}

/*
 * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
 * execute this, optionally dropping the mmap lock when we do so.
 *
 * In both cases this invalidates the VMA, however if we don't drop the lock,
 * then load the correct VMA into vrm->vma afterwards.
 */
static unsigned long shrink_vma(struct vma_remap_struct *vrm,
				bool drop_lock)
{
	struct mm_struct *mm = current->mm;
	unsigned long unmap_start = vrm->addr + vrm->new_len;
	unsigned long unmap_bytes = vrm->delta;
	unsigned long res;
	VMA_ITERATOR(vmi, mm, unmap_start);

	VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);

	res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
			    vrm->uf_unmap, drop_lock);
	vrm->vma = NULL; /* Invalidated. */
	if (res)
		return res;

	/*
	 * If we've not dropped the lock, then we should reload the VMA to
	 * replace the invalidated VMA with the one that may have now been
	 * split.
	 */
	if (drop_lock) {
		vrm->mmap_locked = false;
	} else {
		vrm->vma = vma_lookup(mm, vrm->addr);
		if (!vrm->vma)
			return -EFAULT;
	}

	return 0;
}

/*
 * mremap_to() - remap a vma to a new location.
 * Returns: The new address of the vma or an error.
 */
static unsigned long mremap_to(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	unsigned long err;

	/* Is the new length or address silly? */
	if (vrm->new_len > TASK_SIZE ||
	    vrm->new_addr > TASK_SIZE - vrm->new_len)
		return -EINVAL;

	if (vrm_overlaps(vrm))
		return -EINVAL;

	if (vrm->flags & MREMAP_FIXED) {
		/*
		 * In mremap_to() the VMA is moved to the destination address,
		 * so unmap the destination first. do_munmap() will check
		 * whether the destination is sealed.
		 */
		err = do_munmap(mm, vrm->new_addr, vrm->new_len,
				vrm->uf_unmap_early);
		vrm->vma = NULL; /* Invalidated. */
		if (err)
			return err;

		/*
		 * If we remap a portion of a VMA elsewhere in the same VMA,
		 * this can invalidate the old VMA. Reset.
		 */
		vrm->vma = vma_lookup(mm, vrm->addr);
		if (!vrm->vma)
			return -EFAULT;
	}

	if (vrm->remap_type == MREMAP_SHRINK) {
		err = shrink_vma(vrm, /* drop_lock= */false);
		if (err)
			return err;

		/* Set up for the move now shrink has been executed. */
		vrm->old_len = vrm->new_len;
	}

	err = resize_is_valid(vrm);
	if (err)
		return err;

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (vrm->flags & MREMAP_DONTUNMAP) {
		vm_flags_t vm_flags = vrm->vma->vm_flags;
		unsigned long pages = vrm->old_len >> PAGE_SHIFT;

		if (!may_expand_vm(mm, vm_flags, pages))
			return -ENOMEM;
	}

	err = vrm_set_new_addr(vrm);
	if (err)
		return err;

	return move_vma(vrm);
}

static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;

	if (end < vma->vm_end) /* overflow */
		return 0;
	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/* Determine whether we are actually able to execute an in-place expansion. */
static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
{
	/* Number of bytes from vrm->addr to end of VMA. */
	unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;

	/* If end of range aligns to end of VMA, we can just expand in-place. */
	if (suffix_bytes != vrm->old_len)
		return false;

	/* Check whether this is feasible. */
	if (!vma_expandable(vrm->vma, vrm->delta))
		return false;

	return true;
}

/*
 * Are the parameters passed to mremap() valid? If so return 0, otherwise return
 * error.
 */
static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
{
	unsigned long addr = vrm->addr;
	unsigned long flags = vrm->flags;

	/* Ensure no unexpected flag values. */
	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return -EINVAL;

	/* Start address must be page-aligned. */
	if (offset_in_page(addr))
		return -EINVAL;

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!PAGE_ALIGN(vrm->new_len))
		return -EINVAL;

	/* Remainder of checks are for cases with specific new_addr. */
	if (!vrm_implies_new_addr(vrm))
		return 0;

	/* The new address must be page-aligned. */
	if (offset_in_page(vrm->new_addr))
		return -EINVAL;

	/* A fixed address implies a move. */
	if (!(flags & MREMAP_MAYMOVE))
		return -EINVAL;

	/* MREMAP_DONTUNMAP does not allow resizing in the process. */
	if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
		return -EINVAL;

	/*
	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmapped the regions here
	 * (new_addr, and old_addr), because userspace will not know the
	 * state of the vma's after it gets -ENOMEM.
	 * So, to avoid such scenario we can pre-compute if the whole
	 * operation has a high chance of succeeding map-wise.
	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
	 * split in 3 before unmapping it.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether current map count plus 2 still leads us to 4 maps below
	 * the threshold, otherwise return -ENOMEM here to be more safe.
	 */
	if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	return 0;
}

/*
 * We know we can expand the VMA in-place by delta pages, so do so.
 *
 * If we discover the VMA is locked, update mm_struct statistics accordingly and
 * indicate so to the caller.
 */
static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, vma->vm_end);

	if (!vrm_charge(vrm))
		return -ENOMEM;

	/*
	 * Function vma_merge_extend() is called on the
	 * extension we are adding to the already existing vma,
	 * vma_merge_extend() will merge this extension with the
	 * already existing vma (expand operation itself) and
	 * possibly also with the next vma if it becomes
	 * adjacent to the expanded vma and otherwise
	 * compatible.
	 */
	vma = vma_merge_extend(&vmi, vma, vrm->delta);
	if (!vma) {
		vrm_uncharge(vrm);
		return -ENOMEM;
	}
	vrm->vma = vma;

	vrm_stat_account(vrm, vrm->delta);

	return 0;
}

static bool align_hugetlb(struct vma_remap_struct *vrm)
{
	struct hstate *h __maybe_unused = hstate_vma(vrm->vma);

	vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
	vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));

	/* addrs must be huge page aligned */
	if (vrm->addr & ~huge_page_mask(h))
		return false;
	if (vrm->new_addr & ~huge_page_mask(h))
		return false;

	/*
	 * Don't allow remap expansion, because the underlying hugetlb
	 * reservation is not yet capable of handling split reservations.
	 */
	if (vrm->new_len > vrm->old_len)
		return false;

	vrm_set_delta(vrm);

	return true;
}

/*
 * We are mremap()'ing without specifying a fixed address to move to, but are
 * requesting that the VMA's size be increased.
 *
 * Try to do so in-place, if this fails, then move the VMA to a new location to
 * action the change.
 */
static unsigned long expand_vma(struct vma_remap_struct *vrm)
{
	unsigned long err;
	unsigned long addr = vrm->addr;

	err = resize_is_valid(vrm);
	if (err)
		return err;

	/*
	 * [addr, old_len) spans precisely to the end of the VMA, so try to
	 * expand it in-place.
	 */
	if (vrm_can_expand_in_place(vrm)) {
		err = expand_vma_in_place(vrm);
		if (err)
			return err;

		/*
		 * We want to populate the newly expanded portion of the VMA to
		 * satisfy the expectation that mlock()'ing a VMA maintains all
		 * of its pages in memory.
		 */
		if (vrm->mlocked)
			vrm->new_addr = addr;

		/* OK we're done! */
		return addr;
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it.
	 */

	/* We're not allowed to move the VMA, so error out. */
	if (!(vrm->flags & MREMAP_MAYMOVE))
		return -ENOMEM;

	/* Find a new location to move the VMA to. */
	err = vrm_set_new_addr(vrm);
	if (err)
		return err;

	return move_vma(vrm);
}

/*
 * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
 * first available address to perform the operation.
 */
static unsigned long mremap_at(struct vma_remap_struct *vrm)
{
	unsigned long res;

	switch (vrm->remap_type) {
	case MREMAP_INVALID:
		break;
	case MREMAP_NO_RESIZE:
		/* NO-OP CASE - resizing to the same size. */
		return vrm->addr;
	case MREMAP_SHRINK:
		/*
		 * SHRINK CASE. Can always be done in-place.
		 *
		 * Simply unmap the shrunken portion of the VMA. This does all
		 * the needed commit accounting, and we indicate that the mmap
		 * lock should be dropped.
		 */
		res = shrink_vma(vrm, /* drop_lock= */true);
		if (res)
			return res;

		return vrm->addr;
	case MREMAP_EXPAND:
		return expand_vma(vrm);
	}

	BUG();
}

static unsigned long do_mremap(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret;

	ret = check_mremap_params(vrm);
	if (ret)
		return ret;

	vrm->old_len = PAGE_ALIGN(vrm->old_len);
	vrm->new_len = PAGE_ALIGN(vrm->new_len);
	vrm_set_delta(vrm);

	if (mmap_write_lock_killable(mm))
		return -EINTR;
	vrm->mmap_locked = true;

	vma = vrm->vma = vma_lookup(mm, vrm->addr);
	if (!vma) {
		ret = -EFAULT;
		goto out;
	}

	/* If mseal()'d, mremap() is prohibited. */
	if (!can_modify_vma(vma)) {
		ret = -EPERM;
		goto out;
	}

	/* Align to hugetlb page size, if required. */
	if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
		ret = -EINVAL;
		goto out;
	}

	vrm->remap_type = vrm_remap_type(vrm);

	/* Actually execute mremap. */
	ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);

out:
	if (vrm->mmap_locked) {
		mmap_write_unlock(mm);
		vrm->mmap_locked = false;

		if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
			mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
	}

	userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
	mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
	userfaultfd_unmap_complete(mm, vrm->uf_unmap);

	return ret;
}
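
/*
 * Illustrative userspace usage (not kernel code): an expanding, movable
 * resize such as
 *
 *	void *new = mremap(old, 4096, 2 * 4096, MREMAP_MAYMOVE);
 *
 * enters via the syscall below and is dispatched by do_mremap() to
 * mremap_at()/expand_vma(), while callers passing MREMAP_FIXED or
 * MREMAP_DONTUNMAP with an explicit new_addr are routed to mremap_to().
 */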

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap_early);
	LIST_HEAD(uf_unmap);
	/*
	 * There is a deliberate asymmetry here: we strip the pointer tag
	 * from the old address but leave the new address alone. This is
	 * for consistency with mmap(), where we prevent the creation of
	 * aliasing mappings in userspace by leaving the tag bits of the
	 * mapping address intact. A non-zero tag will cause the subsequent
	 * range checks to reject the address as invalid.
	 *
	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
	 * information.
	 */
	struct vma_remap_struct vrm = {
		.addr = untagged_addr(addr),
		.old_len = old_len,
		.new_len = new_len,
		.flags = flags,
		.new_addr = new_addr,

		.uf = &uf,
		.uf_unmap_early = &uf_unmap_early,
		.uf_unmap = &uf_unmap,

		.remap_type = MREMAP_INVALID, /* We set later. */
	};

	return do_mremap(&vrm);
}