// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"

/* Classify the kind of remap operation being performed. */
enum mremap_type {
	MREMAP_INVALID,		/* Initial state. */
	MREMAP_NO_RESIZE,	/* old_len == new_len, if not moved, do nothing. */
	MREMAP_SHRINK,		/* old_len > new_len. */
	MREMAP_EXPAND,		/* old_len < new_len. */
};

/*
 * Describes a VMA mremap() operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the operation, however these values will
 * always accurately reflect the remap (for instance, we may adjust lengths and
 * delta to account for hugetlb alignment).
 */
struct vma_remap_struct {
	/* User-provided state. */
	unsigned long addr;	/* User-specified address from which we remap. */
	unsigned long old_len;	/* Length of range being remapped. */
	unsigned long new_len;	/* Desired new length of mapping. */
	unsigned long flags;	/* user-specified MREMAP_* flags. */
	unsigned long new_addr;	/* Optionally, desired new address. */

	/* uffd state. */
	struct vm_userfaultfd_ctx *uf;
	struct list_head *uf_unmap_early;
	struct list_head *uf_unmap;

	/* VMA state, determined in do_mremap(). */
	struct vm_area_struct *vma;

	/* Internal state, determined in do_mremap(). */
	unsigned long delta;		/* Absolute delta of old_len, new_len. */
	bool mlocked;			/* Was the VMA mlock()'d? */
	enum mremap_type remap_type;	/* expand, shrink, etc. */
	bool mmap_locked;		/* Is mm currently write-locked? */
	unsigned long charged;		/* If VM_ACCOUNT, # pages to account. */
};

static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}

static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
		pte_t *ptep, pte_t pte, int max_nr)
{
	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	struct folio *folio;

	if (max_nr == 1)
		return 1;

	folio = vm_normal_folio(vma, addr, pte);
	if (!folio || !folio_test_large(folio))
		return 1;

	return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL,
			       NULL, NULL);
}

static int move_ptes(struct pagetable_move_control *pmc,
		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
	struct vm_area_struct *vma = pmc->old;
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_ptep, *new_ptep;
	pte_t old_pte, pte;
	pmd_t dummy_pmdval;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long old_addr = pmc->old_addr;
	unsigned long new_addr = pmc->new_addr;
	unsigned long old_end = old_addr + extent;
	unsigned long len = old_end - old_addr;
	int max_nr_ptes;
	int nr_ptes;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (pmc->need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_ptep) {
		err = -EAGAIN;
		goto out;
	}
	/*
	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
	 * this by traversing file->f_mapping, so there is no concurrency with
	 * retract_page_tables(). In addition, we already hold the exclusive
	 * mmap_lock, so this new_pte page is stable, so there is no need to get
	 * pmdval and do pmd_same() check.
	 */
	new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
					    &new_ptl);
	if (!new_ptep) {
		pte_unmap_unlock(old_ptep, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
	     new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
		VM_WARN_ON_ONCE(!pte_none(*new_ptep));

		nr_ptes = 1;
		max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
		old_pte = ptep_get(old_ptep);
		if (pte_none(old_pte))
			continue;

		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with folio_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(old_pte)) {
			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
							 old_pte, max_nr_ptes);
			force_flush = true;
		}
		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
		pte = move_pte(pte, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);

		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
			pte_clear(mm, new_addr, new_ptep);
		else {
			if (need_clear_uffd_wp) {
				if (pte_present(pte))
					pte = pte_clear_uffd_wp(pte);
				else if (is_swap_pte(pte))
					pte = pte_swp_clear_uffd_wp(pte);
			}
			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
		}
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_ptep - 1);
	pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
	if (pmc->need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif

#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	bool res = false;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/* If this pmd belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	pmd = *old_pmd;

	/* Racing with collapse? */
	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
		goto out_unlock;
	/* Clear the pmd */
	pmd_clear(old_pmd);
	res = true;

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return res;
}
#else
static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
static bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/* If this pud belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	return false;
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static bool move_huge_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, pmc->new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;
}
#endif

enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};

/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
 */
static __always_inline unsigned long get_extent(enum pgt_entry entry,
		struct pagetable_move_control *pmc)
{
	unsigned long next, extent, mask, size;
	unsigned long old_addr = pmc->old_addr;
	unsigned long old_end = pmc->old_end;
	unsigned long new_addr = pmc->new_addr;

	switch (entry) {
	case HPAGE_PMD:
	case NORMAL_PMD:
		mask = PMD_MASK;
		size = PMD_SIZE;
		break;
	case HPAGE_PUD:
	case NORMAL_PUD:
		mask = PUD_MASK;
		size = PUD_SIZE;
		break;
	default:
		BUILD_BUG();
		break;
	}

	next = (old_addr + size) & mask;
	/* even if next overflowed, extent below will be ok */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + size) & mask;
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}
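/*
 * Worked example (illustrative sketch with hypothetical addresses, not taken
 * from the original source): with 2 MiB PMDs, take pmc->old_addr == 0x1ff000,
 * pmc->old_end == 0x600000 and pmc->new_addr == 0x4ff000. For NORMAL_PMD the
 * next source boundary is 0x200000, so the extent is first capped to 0x1000;
 * the destination allows 0x600000 - 0x4ff000 == 0x101000, which does not
 * lower it further, so get_extent() returns 0x1000 and move_page_tables()
 * falls back to move_ptes() for that sliver. Because the two addresses are
 * not mutually PMD-aligned, no iteration ever produces a full PMD_SIZE
 * extent here - the mutual-alignment requirement enforced by
 * can_realign_addr() below exists for exactly this reason.
 */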

/*
 * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
 * the PMC, or overridden in the case of normal, larger page tables.
 */
static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
				   enum pgt_entry entry)
{
	switch (entry) {
	case NORMAL_PMD:
	case NORMAL_PUD:
		return true;
	default:
		return pmc->need_rmap_locks;
	}
}

/*
 * Attempts to speedup the move by moving entry at the level corresponding to
 * pgt_entry. Returns true if the move was successful, else false.
 */
static bool move_pgt_entry(struct pagetable_move_control *pmc,
			   enum pgt_entry entry, void *old_entry, void *new_entry)
{
	bool moved = false;
	bool need_rmap_locks = should_take_rmap_locks(pmc, entry);

	/* See comment in move_ptes() */
	if (need_rmap_locks)
		take_rmap_locks(pmc->old);

	switch (entry) {
	case NORMAL_PMD:
		moved = move_normal_pmd(pmc, old_entry, new_entry);
		break;
	case NORMAL_PUD:
		moved = move_normal_pud(pmc, old_entry, new_entry);
		break;
	case HPAGE_PMD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
				      new_entry);
		break;
	case HPAGE_PUD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pud(pmc, old_entry, new_entry);
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (need_rmap_locks)
		drop_rmap_locks(pmc->old);

	return moved;
}

/*
 * A helper to check if aligning down is OK. The aligned address should fall
 * on *no mapping*. For the stack moving down, that's a special move within
 * the VMA that is created to span the source and destination of the move,
 * so we make an exception for it.
 */
static bool can_align_down(struct pagetable_move_control *pmc,
			   struct vm_area_struct *vma, unsigned long addr_to_align,
			   unsigned long mask)
{
	unsigned long addr_masked = addr_to_align & mask;

	/*
	 * If @addr_to_align of either source or destination is not the beginning
	 * of the corresponding VMA, we can't align down or we will destroy part
	 * of the current mapping.
	 */
	if (!pmc->for_stack && vma->vm_start != addr_to_align)
		return false;

	/* In the stack case we explicitly permit in-VMA alignment. */
	if (pmc->for_stack && addr_masked >= vma->vm_start)
		return true;

	/*
	 * Make sure the realignment doesn't cause the address to fall on an
	 * existing mapping.
	 */
	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}

/*
 * Determine if we are in fact able to realign for efficiency to a higher page
 * table boundary.
 */
static bool can_realign_addr(struct pagetable_move_control *pmc,
			     unsigned long pagetable_mask)
{
	unsigned long align_mask = ~pagetable_mask;
	unsigned long old_align = pmc->old_addr & align_mask;
	unsigned long new_align = pmc->new_addr & align_mask;
	unsigned long pagetable_size = align_mask + 1;
	unsigned long old_align_next = pagetable_size - old_align;

	/*
	 * We don't want to have to go hunting for VMAs from the end of the old
	 * VMA to the next page table boundary, also we want to make sure the
	 * operation is worthwhile.
	 *
	 * So ensure that we only perform this realignment if the end of the
	 * range being copied reaches or crosses the page table boundary.
	 *
	 * boundary                        boundary
	 *    .<- old_align ->             .
	 *    .               |----------------.-----------|
	 *    .               |          vma   .           |
	 *    .               |----------------.-----------|
	 *    .               <----------------.----------->
	 *    .                       len_in   .
	 *    <------------------------------->.
	 *    .        pagetable_size          .
	 *    .               <---------------->
	 *    .                old_align_next  .
	 */
	if (pmc->len_in < old_align_next)
		return false;

	/* Skip if the addresses are already aligned. */
	if (old_align == 0)
		return false;

	/* Only realign if the new and old addresses are mutually aligned. */
	if (old_align != new_align)
		return false;

	/* Ensure realignment doesn't cause overlap with existing mappings. */
	if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
	    !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
		return false;

	return true;
}

/*
 * Opportunistically realign to specified boundary for faster copy.
 *
 * Consider an mremap() of a VMA with page table boundaries as below, and no
 * preceding VMAs from the lower page table boundary to the start of the VMA,
 * with the end of the range reaching or crossing the page table boundary.
 *
 *   boundary                        boundary
 *      .               |----------------.-----------|
 *      .               |          vma   .           |
 *      .               |----------------.-----------|
 *      .      pmc->old_addr             .      pmc->old_end
 *      .               <---------------------------->
 *      .                  move these page tables
 *
 * If we proceed with moving page tables in this scenario, we will have a lot of
 * work to do traversing old page tables and establishing new ones in the
 * destination across multiple lower level page tables.
 *
 * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
 * page table boundary, so we can simply copy a single page table entry for the
 * aligned portion of the VMA instead:
 *
 *   boundary                        boundary
 *      .               |----------------.-----------|
 *      .               |          vma   .           |
 *      .               |----------------.-----------|
 * pmc->old_addr                         .      pmc->old_end
 *      <--------------------------------------------->
 *      .                  move these page tables
 */
static void try_realign_addr(struct pagetable_move_control *pmc,
			     unsigned long pagetable_mask)
{
	if (!can_realign_addr(pmc, pagetable_mask))
		return;

	/*
	 * Simply align to page table boundaries. Note that we do NOT update the
	 * pmc->old_end value, and since the move_page_tables() operation spans
	 * from [old_addr, old_end) (offsetting new_addr as it is performed),
	 * this simply changes the start of the copy, not the end.
	 */
	pmc->old_addr &= pagetable_mask;
	pmc->new_addr &= pagetable_mask;
}
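/*
 * Worked example (illustrative sketch with hypothetical addresses, not taken
 * from the original source): with 2 MiB PMDs (pagetable_mask == PMD_MASK),
 * old_addr == 0x2ff000 and new_addr == 0x6ff000 share the same offset
 * 0xff000 within their respective PMDs, so they are mutually aligned. If
 * len_in >= 0x101000 (the distance to the next PMD boundary), each address
 * begins its VMA (or this is the for_stack case), and no other mapping
 * occupies [0x200000, 0x2ff000) or [0x600000, 0x6ff000), both addresses are
 * realigned down to 0x200000 and 0x600000 respectively and the leading
 * portion of the move can then proceed one PMD entry at a time.
 */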

/* Is the page table move operation done? */
static bool pmc_done(struct pagetable_move_control *pmc)
{
	return pmc->old_addr >= pmc->old_end;
}

/* Advance to the next page table, offset by extent bytes. */
static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
{
	pmc->old_addr += extent;
	pmc->new_addr += extent;
}

/*
 * Determine how many bytes in the specified input range have had their page
 * tables moved so far.
 */
static unsigned long pmc_progress(struct pagetable_move_control *pmc)
{
	unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
	unsigned long old_addr = pmc->old_addr;

	/*
	 * Prevent negative return values when {old,new}_addr was realigned but
	 * we broke out of the loop in move_page_tables() for the first PMD
	 * itself.
	 */
	return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
}

unsigned long move_page_tables(struct pagetable_move_control *pmc)
{
	unsigned long extent;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;
	struct mm_struct *mm = pmc->old->vm_mm;

	if (!pmc->len_in)
		return 0;

	if (is_vm_hugetlb_page(pmc->old))
		return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
						pmc->new_addr, pmc->len_in);

	/*
	 * If possible, realign addresses to PMD boundary for faster copy.
	 * Only realign if the mremap copying hits a PMD boundary.
	 */
	try_realign_addr(pmc, PMD_MASK);

	flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
				pmc->old_addr, pmc->old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, pmc);

		old_pud = get_old_pud(mm, pmc->old_addr);
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(mm, pmc->new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
			if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
				continue;
		}

		extent = get_extent(NORMAL_PMD, pmc);
		old_pmd = get_old_pmd(mm, pmc->old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(mm, pmc->new_addr);
		if (!new_pmd)
			break;
again:
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
				continue;
			split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
				continue;
		}
		if (pmd_none(*old_pmd))
			continue;
		if (pte_alloc(pmc->new->vm_mm, new_pmd))
			break;
		if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
			goto again;
	}

	mmu_notifier_invalidate_range_end(&range);

	return pmc_progress(pmc);
}

/* Set vrm->delta to the difference in VMA size specified by user. */
static void vrm_set_delta(struct vma_remap_struct *vrm)
{
	vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
}

/* Determine what kind of remap this is - shrink, expand or no resize at all. */
static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
{
	if (vrm->delta == 0)
		return MREMAP_NO_RESIZE;

	if (vrm->old_len > vrm->new_len)
		return MREMAP_SHRINK;

	return MREMAP_EXPAND;
}

/*
 * When moving a VMA to vrm->new_addr, does this result in the new and old VMAs
 * overlapping?
 */
static bool vrm_overlaps(struct vma_remap_struct *vrm)
{
	unsigned long start_old = vrm->addr;
	unsigned long start_new = vrm->new_addr;
	unsigned long end_old = vrm->addr + vrm->old_len;
	unsigned long end_new = vrm->new_addr + vrm->new_len;

	/*
	 * start_old    end_old
	 *     |-----------|
	 *     |           |
	 *     |-----------|
	 *             |-------------|
	 *             |             |
	 *             |-------------|
	 *         start_new      end_new
	 */
	if (end_old > start_new && end_new > start_old)
		return true;

	return false;
}

/* Do the mremap() flags require that the new_addr parameter be specified? */
static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
	return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
}

/*
 * Find an unmapped area for the requested vrm->new_addr.
 *
 * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
 * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
 * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
 *
 * Returns 0 on success (with vrm->new_addr updated), or an error code upon
 * failure.
 */
static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
{
	struct vm_area_struct *vma = vrm->vma;
	unsigned long map_flags = 0;
	/* Page Offset _into_ the VMA. */
	pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
	unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
	unsigned long res;

	if (vrm->flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
				map_flags);
	if (IS_ERR_VALUE(res))
		return res;

	vrm->new_addr = res;
	return 0;
}

/*
 * Keep track of pages which have been added to the memory mapping. If the VMA
 * is accounted, also check to see if there is sufficient memory.
 *
 * Returns true on success, false if insufficient memory to charge.
 */
static bool vrm_charge(struct vma_remap_struct *vrm)
{
	unsigned long charged;

	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return true;

	/*
	 * If we don't unmap the old mapping, then we account the entirety of
	 * the length of the new one. Otherwise it's just the delta in size.
	 */
	if (vrm->flags & MREMAP_DONTUNMAP)
		charged = vrm->new_len >> PAGE_SHIFT;
	else
		charged = vrm->delta >> PAGE_SHIFT;

	/* This accounts 'charged' pages of memory. */
	if (security_vm_enough_memory_mm(current->mm, charged))
		return false;

	vrm->charged = charged;
	return true;
}

/*
 * An error has occurred so we will not be using vrm->charged memory. Unaccount
 * this memory if the VMA is accounted.
 */
static void vrm_uncharge(struct vma_remap_struct *vrm)
{
	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return;

	vm_unacct_memory(vrm->charged);
	vrm->charged = 0;
}

/*
 * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
 * account for 'bytes' memory used, and if locked, indicate this in the VRM so
 * we can handle this correctly later.
 */
static void vrm_stat_account(struct vma_remap_struct *vrm,
			     unsigned long bytes)
{
	unsigned long pages = bytes >> PAGE_SHIFT;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;

	vm_stat_account(mm, vma->vm_flags, pages);
	if (vma->vm_flags & VM_LOCKED) {
		mm->locked_vm += pages;
		vrm->mlocked = true;
	}
}

/*
 * Perform checks before attempting to write a VMA prior to it being
 * moved.
 */
static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
{
	unsigned long err = 0;
	struct vm_area_struct *vma = vrm->vma;
	unsigned long old_addr = vrm->addr;
	unsigned long old_len = vrm->old_len;
	unsigned long dummy = vma->vm_flags;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (current->mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped. But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
			  MADV_UNMERGEABLE, &dummy);
	if (err)
		return err;

	return 0;
}

/*
 * Unmap source VMA for VMA move, turning it from a copy to a move, being
 * careful to ensure we do not underflow memory accounting while doing so if
 * this is an accountable move.
 *
 * This is best effort; if we fail to unmap then we simply try to correct
 * accounting and exit.
 */
static void unmap_source_vma(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr = vrm->addr;
	unsigned long len = vrm->old_len;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, addr);
	int err;
	unsigned long vm_start;
	unsigned long vm_end;
	/*
	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
	 * function implies that we unmap the original VMA, which seems
	 * contradictory.
	 *
	 * However, this occurs when this operation was attempted and an error
	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
	 * we actually _do_ want it to be unaccounted.
	 */
	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
		!(vrm->flags & MREMAP_DONTUNMAP);

	/*
	 * So we perform a trick here to prevent incorrect accounting. Any merge
	 * or new VMA allocation performed in copy_vma() does not adjust
	 * accounting, it is expected that callers handle this.
	 *
	 * And indeed we already have, accounting appropriately in the case of
	 * both in vrm_charge().
	 *
	 * However, when we unmap the existing VMA (to effect the move), this
	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
	 * removed pages.
	 *
	 * To avoid this we temporarily clear this flag, reinstating on any
	 * portions of the original VMA that remain.
	 */
	if (accountable_move) {
		vm_flags_clear(vma, VM_ACCOUNT);
		/* We are about to split vma, so store the start/end. */
		vm_start = vma->vm_start;
		vm_end = vma->vm_end;
	}

	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
	vrm->vma = NULL; /* Invalidated. */
	if (err) {
		/* OOM: unable to split vma, just get accounts right */
		vm_acct_memory(len >> PAGE_SHIFT);
		return;
	}

	/*
	 * If we mremap() from a VMA like this:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |-------------|
	 * |             |
	 * |-------------|
	 *
	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
	 * we'll end up with:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |---|     |---|
	 * | A |     | B |
	 * |---|     |---|
	 *
	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
	 * a subsequent or lone vma_next() will give us B.
	 *
	 * do_vmi_munmap() will have restored the VMI back to addr.
	 */
	if (accountable_move) {
		unsigned long end = addr + len;

		if (vm_start < addr) {
			struct vm_area_struct *prev = vma_prev(&vmi);

			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
		}

		if (vm_end > end) {
			struct vm_area_struct *next = vma_next(&vmi);

			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
		}
	}
}

/*
 * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
 * process. Additionally handle an error occurring on moving of page tables,
 * where we reset vrm state to cause unmapping of the new VMA.
 *
 * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
 * error code.
 */
static int copy_vma_and_data(struct vma_remap_struct *vrm,
			     struct vm_area_struct **new_vma_ptr)
{
	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
	unsigned long moved_len;
	struct vm_area_struct *vma = vrm->vma;
	struct vm_area_struct *new_vma;
	int err = 0;
	PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);

	new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
			   &pmc.need_rmap_locks);
	if (!new_vma) {
		vrm_uncharge(vrm);
		*new_vma_ptr = NULL;
		return -ENOMEM;
	}
	vrm->vma = vma;
	pmc.old = vma;
	pmc.new = new_vma;

	moved_len = move_page_tables(&pmc);
	if (moved_len < vrm->old_len)
		err = -ENOMEM;
	else if (vma->vm_ops && vma->vm_ops->mremap)
		err = vma->vm_ops->mremap(new_vma);

	if (unlikely(err)) {
		PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
			       vrm->addr, moved_len);

		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		pmc_revert.need_rmap_locks = true;
		move_page_tables(&pmc_revert);

		vrm->vma = new_vma;
		vrm->old_len = vrm->new_len;
		vrm->addr = vrm->new_addr;
	} else {
		mremap_userfaultfd_prep(new_vma, vrm->uf);
	}

	fixup_hugetlb_reservations(vma);

	*new_vma_ptr = new_vma;
	return err;
}

/*
 * Perform final tasks for MREMAP_DONTUNMAP operation, clearing mlock() and
 * account flags on remaining VMA by convention (it cannot be mlock()'d any
 * longer, as pages in range are no longer mapped), and removing anon_vma_chain
 * links from it (if the entire VMA was copied over).
 */
static void dontunmap_complete(struct vma_remap_struct *vrm,
			       struct vm_area_struct *new_vma)
{
	unsigned long start = vrm->addr;
	unsigned long end = vrm->addr + vrm->old_len;
	unsigned long old_start = vrm->vma->vm_start;
	unsigned long old_end = vrm->vma->vm_end;

	/*
	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
	 * vma.
	 */
	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);

	/*
	 * anon_vma links of the old vma are no longer needed after its page
	 * table has been moved.
	 */
	if (new_vma != vrm->vma && start == old_start && end == old_end)
		unlink_anon_vmas(vrm->vma);

	/* Because we won't unmap we don't need to touch locked_vm. */
}

static unsigned long move_vma(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *new_vma;
	unsigned long hiwater_vm;
	int err;

	err = prep_move_vma(vrm);
	if (err)
		return err;

	/* If accounted, charge the number of bytes the operation will use. */
	if (!vrm_charge(vrm))
		return -ENOMEM;

	/* We don't want racing faults. */
	vma_start_write(vrm->vma);

	/* Perform copy step. */
	err = copy_vma_and_data(vrm, &new_vma);
	/*
	 * If we established the copied-to VMA, we attempt to recover from the
	 * error by setting the destination VMA to the source VMA and unmapping
	 * it below.
	 */
	if (err && !new_vma)
		return err;

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;

	vrm_stat_account(vrm, vrm->new_len);
	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
		dontunmap_complete(vrm, new_vma);
	else
		unmap_source_vma(vrm);

	mm->hiwater_vm = hiwater_vm;

	return err ? (unsigned long)err : vrm->new_addr;
}

/*
 * resize_is_valid() - Ensure the vma can be resized to the new length at the
 * given address.
 *
 * Return 0 on success, error otherwise.
 */
static int resize_is_valid(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;
	unsigned long addr = vrm->addr;
	unsigned long old_len = vrm->old_len;
	unsigned long new_len = vrm->new_len;
	unsigned long pgoff;

	/*
	 * !old_len is a special case where an attempt is made to 'duplicate'
	 * a mapping. This makes no sense for private mappings as it will
	 * instead create a fresh/new mapping unrelated to the original. This
	 * is contrary to the basic idea of mremap which creates new mappings
	 * based on the original. There are no known use cases for this
	 * behavior. As a result, fail such attempts.
	 */
	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
			     current->comm, current->pid);
		return -EINVAL;
	}

	if ((vrm->flags & MREMAP_DONTUNMAP) &&
	    (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
		return -EINVAL;

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return -EFAULT;

	if (new_len == old_len)
		return 0;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return -EINVAL;

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return -EFAULT;

	if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
		return -EAGAIN;

	if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
		return -ENOMEM;

	return 0;
}

/*
 * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
 * execute this, optionally dropping the mmap lock when we do so.
 *
 * In both cases this invalidates the VMA, however if we don't drop the lock,
 * then load the correct VMA into vrm->vma afterwards.
 */
static unsigned long shrink_vma(struct vma_remap_struct *vrm,
				bool drop_lock)
{
	struct mm_struct *mm = current->mm;
	unsigned long unmap_start = vrm->addr + vrm->new_len;
	unsigned long unmap_bytes = vrm->delta;
	unsigned long res;
	VMA_ITERATOR(vmi, mm, unmap_start);

	VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);

	res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
			    vrm->uf_unmap, drop_lock);
	vrm->vma = NULL; /* Invalidated. */
	if (res)
		return res;

	/*
	 * If we've not dropped the lock, then we should reload the VMA to
	 * replace the invalidated VMA with the one that may have now been
	 * split.
	 */
	if (drop_lock) {
		vrm->mmap_locked = false;
	} else {
		vrm->vma = vma_lookup(mm, vrm->addr);
		if (!vrm->vma)
			return -EFAULT;
	}

	return 0;
}

/*
 * mremap_to() - remap a vma to a new location.
 * Returns: The new address of the vma or an error.
 */
static unsigned long mremap_to(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	unsigned long err;

	/* Is the new length or address silly? */
	if (vrm->new_len > TASK_SIZE ||
	    vrm->new_addr > TASK_SIZE - vrm->new_len)
		return -EINVAL;

	if (vrm_overlaps(vrm))
		return -EINVAL;

	if (vrm->flags & MREMAP_FIXED) {
		/*
		 * In mremap_to().
		 * VMA is moved to dst address, and munmap dst first.
		 * do_munmap will check if dst is sealed.
		 */
		err = do_munmap(mm, vrm->new_addr, vrm->new_len,
				vrm->uf_unmap_early);
		vrm->vma = NULL; /* Invalidated. */
		if (err)
			return err;

		/*
		 * If we remap a portion of a VMA elsewhere in the same VMA,
		 * this can invalidate the old VMA. Reset.
		 */
		vrm->vma = vma_lookup(mm, vrm->addr);
		if (!vrm->vma)
			return -EFAULT;
	}

	if (vrm->remap_type == MREMAP_SHRINK) {
		err = shrink_vma(vrm, /* drop_lock= */false);
		if (err)
			return err;

		/* Set up for the move now shrink has been executed. */
		vrm->old_len = vrm->new_len;
	}

	err = resize_is_valid(vrm);
	if (err)
		return err;

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (vrm->flags & MREMAP_DONTUNMAP) {
		vm_flags_t vm_flags = vrm->vma->vm_flags;
		unsigned long pages = vrm->old_len >> PAGE_SHIFT;

		if (!may_expand_vm(mm, vm_flags, pages))
			return -ENOMEM;
	}

	err = vrm_set_new_addr(vrm);
	if (err)
		return err;

	return move_vma(vrm);
}

static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;

	if (end < vma->vm_end) /* overflow */
		return 0;
	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/* Determine whether we are actually able to execute an in-place expansion. */
static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
{
	/* Number of bytes from vrm->addr to end of VMA. */
	unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;

	/* If end of range aligns to end of VMA, we can just expand in-place. */
	if (suffix_bytes != vrm->old_len)
		return false;

	/* Check whether this is feasible. */
	if (!vma_expandable(vrm->vma, vrm->delta))
		return false;

	return true;
}

/*
 * Are the parameters passed to mremap() valid? If so return 0, otherwise return
 * error.
 */
static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
{
	unsigned long addr = vrm->addr;
	unsigned long flags = vrm->flags;

	/* Ensure no unexpected flag values. */
	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return -EINVAL;

	/* Start address must be page-aligned. */
	if (offset_in_page(addr))
		return -EINVAL;

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!PAGE_ALIGN(vrm->new_len))
		return -EINVAL;

	/* Remainder of checks are for cases with specific new_addr. */
	if (!vrm_implies_new_addr(vrm))
		return 0;

	/* The new address must be page-aligned. */
	if (offset_in_page(vrm->new_addr))
		return -EINVAL;

	/* A fixed address implies a move. */
	if (!(flags & MREMAP_MAYMOVE))
		return -EINVAL;

	/* MREMAP_DONTUNMAP does not allow resizing in the process. */
	if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
		return -EINVAL;

	/*
	 * move_vma() need us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmapped the regions here
	 * (new_addr, and old_addr), because userspace will not know the
	 * state of the vma's after it gets -ENOMEM.
	 * So, to avoid such scenario we can pre-compute if the whole
	 * operation has high chances to succeed map-wise.
	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
	 * split in 3 before unmapping it.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether current map count plus 2 still leads us to 4 maps below
	 * the threshold, otherwise return -ENOMEM here to be more safe.
	 */
	if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	return 0;
}

/*
 * We know we can expand the VMA in-place by delta pages, so do so.
 *
 * If we discover the VMA is locked, update mm_struct statistics accordingly and
 * indicate so to the caller.
 */
static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, vma->vm_end);

	if (!vrm_charge(vrm))
		return -ENOMEM;

	/*
	 * Function vma_merge_extend() is called on the
	 * extension we are adding to the already existing vma,
	 * vma_merge_extend() will merge this extension with the
	 * already existing vma (expand operation itself) and
	 * possibly also with the next vma if it becomes
	 * adjacent to the expanded vma and otherwise
	 * compatible.
	 */
	vma = vma_merge_extend(&vmi, vma, vrm->delta);
	if (!vma) {
		vrm_uncharge(vrm);
		return -ENOMEM;
	}
	vrm->vma = vma;

	vrm_stat_account(vrm, vrm->delta);

	return 0;
}

static bool align_hugetlb(struct vma_remap_struct *vrm)
{
	struct hstate *h __maybe_unused = hstate_vma(vrm->vma);

	vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
	vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));

	/* addrs must be huge page aligned */
	if (vrm->addr & ~huge_page_mask(h))
		return false;
	if (vrm->new_addr & ~huge_page_mask(h))
		return false;

	/*
	 * Don't allow remap expansion, because the underlying hugetlb
	 * reservation is not yet capable of handling split reservations.
	 */
	if (vrm->new_len > vrm->old_len)
		return false;

	vrm_set_delta(vrm);

	return true;
}

/*
 * We are mremap()'ing without specifying a fixed address to move to, but are
 * requesting that the VMA's size be increased.
 *
 * Try to do so in-place, if this fails, then move the VMA to a new location to
 * action the change.
 */
static unsigned long expand_vma(struct vma_remap_struct *vrm)
{
	unsigned long err;
	unsigned long addr = vrm->addr;

	err = resize_is_valid(vrm);
	if (err)
		return err;

	/*
	 * [addr, old_len) spans precisely to the end of the VMA, so try to
	 * expand it in-place.
	 */
	if (vrm_can_expand_in_place(vrm)) {
		err = expand_vma_in_place(vrm);
		if (err)
			return err;

		/*
		 * We want to populate the newly expanded portion of the VMA to
		 * satisfy the expectation that mlock()'ing a VMA maintains all
		 * of its pages in memory.
		 */
		if (vrm->mlocked)
			vrm->new_addr = addr;

		/* OK we're done! */
		return addr;
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it.
	 */

	/* We're not allowed to move the VMA, so error out. */
	if (!(vrm->flags & MREMAP_MAYMOVE))
		return -ENOMEM;

	/* Find a new location to move the VMA to. */
	err = vrm_set_new_addr(vrm);
	if (err)
		return err;

	return move_vma(vrm);
}

/*
 * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
 * first available address to perform the operation.
 */
static unsigned long mremap_at(struct vma_remap_struct *vrm)
{
	unsigned long res;

	switch (vrm->remap_type) {
	case MREMAP_INVALID:
		break;
	case MREMAP_NO_RESIZE:
		/* NO-OP CASE - resizing to the same size. */
		return vrm->addr;
	case MREMAP_SHRINK:
		/*
		 * SHRINK CASE. Can always be done in-place.
		 *
		 * Simply unmap the shrunken portion of the VMA. This does all
		 * the needed commit accounting, and we indicate that the mmap
		 * lock should be dropped.
		 */
		res = shrink_vma(vrm, /* drop_lock= */true);
		if (res)
			return res;

		return vrm->addr;
	case MREMAP_EXPAND:
		return expand_vma(vrm);
	}

	BUG();
}

static unsigned long do_mremap(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret;

	ret = check_mremap_params(vrm);
	if (ret)
		return ret;

	vrm->old_len = PAGE_ALIGN(vrm->old_len);
	vrm->new_len = PAGE_ALIGN(vrm->new_len);
	vrm_set_delta(vrm);

	if (mmap_write_lock_killable(mm))
		return -EINTR;
	vrm->mmap_locked = true;

	vma = vrm->vma = vma_lookup(mm, vrm->addr);
	if (!vma) {
		ret = -EFAULT;
		goto out;
	}

	/* If mseal()'d, mremap() is prohibited. */
	if (!can_modify_vma(vma)) {
		ret = -EPERM;
		goto out;
	}

	/* Align to hugetlb page size, if required. */
	if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
		ret = -EINVAL;
		goto out;
	}

	vrm->remap_type = vrm_remap_type(vrm);

	/* Actually execute mremap. */
	ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);

out:
	if (vrm->mmap_locked) {
		mmap_write_unlock(mm);
		vrm->mmap_locked = false;

		if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
			mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
	}

	userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
	mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
	userfaultfd_unmap_complete(mm, vrm->uf_unmap);

	return ret;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap_early);
	LIST_HEAD(uf_unmap);
	/*
	 * There is a deliberate asymmetry here: we strip the pointer tag
	 * from the old address but leave the new address alone. This is
	 * for consistency with mmap(), where we prevent the creation of
	 * aliasing mappings in userspace by leaving the tag bits of the
	 * mapping address intact. A non-zero tag will cause the subsequent
	 * range checks to reject the address as invalid.
	 *
	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
	 * information.
	 */
	struct vma_remap_struct vrm = {
		.addr = untagged_addr(addr),
		.old_len = old_len,
		.new_len = new_len,
		.flags = flags,
		.new_addr = new_addr,

		.uf = &uf,
		.uf_unmap_early = &uf_unmap_early,
		.uf_unmap = &uf_unmap,

		.remap_type = MREMAP_INVALID, /* We set later. */
	};

	return do_mremap(&vrm);
}
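/*
 * Illustrative userspace sketch (hypothetical example program, not part of
 * the original source; assumes 4 KiB pages): growing an anonymous mapping
 * with mremap(2), letting the kernel relocate it if it cannot be expanded in
 * place. This exercises the MREMAP_EXPAND path above (expand_vma(), and
 * move_vma() when relocation is required).
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t old_len = 4 * 4096, new_len = 64 * 4096;
 *		void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (p == MAP_FAILED)
 *			return 1;
 *
 *		void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
 *		if (q == MAP_FAILED)
 *			return 1;
 *
 *		printf("moved: %s\n", q == p ? "no" : "yes");
 *		return munmap(q, new_len);
 *	}
 */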