// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"

/* Classify the kind of remap operation being performed. */
enum mremap_type {
	MREMAP_INVALID,		/* Initial state. */
	MREMAP_NO_RESIZE,	/* old_len == new_len, if not moved, do nothing. */
	MREMAP_SHRINK,		/* old_len > new_len. */
	MREMAP_EXPAND,		/* old_len < new_len. */
};

/*
 * Describes a VMA mremap() operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the operation, however these values will
 * always accurately reflect the remap (for instance, we may adjust lengths and
 * delta to account for hugetlb alignment).
 */
struct vma_remap_struct {
	/* User-provided state. */
	unsigned long addr;	/* User-specified address from which we remap. */
	unsigned long old_len;	/* Length of range being remapped. */
	unsigned long new_len;	/* Desired new length of mapping. */
	unsigned long flags;	/* user-specified MREMAP_* flags. */
	unsigned long new_addr;	/* Optionally, desired new address. */

	/* uffd state. */
	struct vm_userfaultfd_ctx *uf;
	struct list_head *uf_unmap_early;
	struct list_head *uf_unmap;

	/* VMA state, determined in do_mremap(). */
	struct vm_area_struct *vma;

	/* Internal state, determined in do_mremap(). */
	unsigned long delta;		/* Absolute delta of old_len,new_len. */
	bool mlocked;			/* Was the VMA mlock()'d? */
	enum mremap_type remap_type;	/* expand, shrink, etc. */
	bool mmap_locked;		/* Is mm currently write-locked? */
	unsigned long charged;		/* If VM_ACCOUNT, # pages to account. */
};

/*
 * Walk the page table hierarchy to the PUD entry covering @addr, returning
 * NULL if any level is absent (or bad).
 */
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}

/*
 * Walk to the PMD entry covering @addr in the source page tables, returning
 * NULL if it (or a higher level) is not present.
 */
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

/*
 * Allocate (if necessary) intermediate levels and return the PUD entry
 * covering @addr in the destination page tables, or NULL on allocation
 * failure.
 */
static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}

/*
 * Allocate (if necessary) and return the PMD entry covering @addr in the
 * destination page tables, or NULL on allocation failure. The new PMD must
 * not be a huge entry.
 */
static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

/* Take rmap locks for @vma: file i_mmap lock first, then anon_vma lock. */
static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

/* Drop rmap locks for @vma, in the reverse order of take_rmap_locks(). */
static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}

/*
 * Determine how many PTEs starting at @ptep map consecutive pages of the same
 * large folio, capped at @max_nr. Returns 1 when the PTE does not map a large
 * folio (or @max_nr is 1), so callers can always advance by the return value.
 */
static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
		pte_t *ptep, pte_t pte, int max_nr)
{
	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	struct folio *folio;

	if (max_nr == 1)
		return 1;

	folio = vm_normal_folio(vma, addr, pte);
	if (!folio || !folio_test_large(folio))
		return 1;

	return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL,
			       NULL, NULL);
}

/*
 * Move the PTEs covering [pmc->old_addr, pmc->old_addr + extent) from
 * @old_pmd to @new_pmd. Returns 0 on success, or -EAGAIN if either page
 * table page could not be mapped (caller retries).
 */
static int move_ptes(struct pagetable_move_control *pmc,
		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
	struct vm_area_struct *vma = pmc->old;
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_ptep, *new_ptep;
	pte_t old_pte, pte;
	pmd_t dummy_pmdval;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long old_addr = pmc->old_addr;
	unsigned long new_addr = pmc->new_addr;
	unsigned long old_end = old_addr + extent;
	unsigned long len = old_end - old_addr;
	int max_nr_ptes;
	int nr_ptes;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (pmc->need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_ptep) {
		err = -EAGAIN;
		goto out;
	}
	/*
	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
	 * this by traversing file->f_mapping, so there is no concurrency with
	 * retract_page_tables(). In addition, we already hold the exclusive
	 * mmap_lock, so this new_pte page is stable, so there is no need to get
	 * pmdval and do pmd_same() check.
	 */
	new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
					    &new_ptl);
	if (!new_ptep) {
		pte_unmap_unlock(old_ptep, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
	       new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
		VM_WARN_ON_ONCE(!pte_none(*new_ptep));

		nr_ptes = 1;
		max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
		old_pte = ptep_get(old_ptep);
		if (pte_none(old_pte))
			continue;

		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with folio_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(old_pte)) {
			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
							 old_pte, max_nr_ptes);
			force_flush = true;
		}
		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
		pte = move_pte(pte, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);

		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
			pte_clear(mm, new_addr, new_ptep);
		else {
			if (need_clear_uffd_wp) {
				if (pte_present(pte))
					pte = pte_clear_uffd_wp(pte);
				else if (is_swap_pte(pte))
					pte = pte_swp_clear_uffd_wp(pte);
			}
			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
		}
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_ptep - 1);
	pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
	if (pmc->need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
/* Default: whole-page-table moves are allowed iff the arch opted in. */
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif

#ifdef CONFIG_HAVE_MOVE_PMD
/*
 * Attempt to move an entire (non-huge) PMD entry from @old_pmd to @new_pmd,
 * relinking the whole PTE page rather than copying individual PTEs. Returns
 * true if the entry was moved.
 */
static bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	bool res = false;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/* If this pmd belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	pmd = *old_pmd;

	/* Racing with collapse? */
	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
		goto out_unlock;
	/* Clear the pmd */
	pmd_clear(old_pmd);
	res = true;

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return res;
}
#else
static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
/*
 * Attempt to move an entire (non-huge) PUD entry from @old_pud to @new_pud,
 * relinking the whole PMD page. Returns true if the entry was moved.
 */
static bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/* If this pud belongs to a uffd vma with remap events disabled, we need
	 * to ensure that the uffd-wp state is cleared from all pgtables. This
	 * means recursing into lower page tables in move_page_tables(), and we
	 * can reuse the existing code if we simply treat the entry as "not
	 * moved".
	 */
	if (vma_has_uffd_without_event_remap(vma))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	return false;
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Move a huge (leaf) PUD entry from @old_pud to @new_pud. Returns true if
 * the entry was moved.
 */
static bool move_huge_pud(struct pagetable_move_control *pmc,
			pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, pmc->new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct pagetable_move_control *pmc,
			pud_t *old_pud, pud_t *new_pud)

{
	WARN_ON_ONCE(1);
	return false;

}
#endif

/* Level/kind of page table entry move_pgt_entry() is asked to move. */
enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};

/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
528 */ 529 static __always_inline unsigned long get_extent(enum pgt_entry entry, 530 struct pagetable_move_control *pmc) 531 { 532 unsigned long next, extent, mask, size; 533 unsigned long old_addr = pmc->old_addr; 534 unsigned long old_end = pmc->old_end; 535 unsigned long new_addr = pmc->new_addr; 536 537 switch (entry) { 538 case HPAGE_PMD: 539 case NORMAL_PMD: 540 mask = PMD_MASK; 541 size = PMD_SIZE; 542 break; 543 case HPAGE_PUD: 544 case NORMAL_PUD: 545 mask = PUD_MASK; 546 size = PUD_SIZE; 547 break; 548 default: 549 BUILD_BUG(); 550 break; 551 } 552 553 next = (old_addr + size) & mask; 554 /* even if next overflowed, extent below will be ok */ 555 extent = next - old_addr; 556 if (extent > old_end - old_addr) 557 extent = old_end - old_addr; 558 next = (new_addr + size) & mask; 559 if (extent > next - new_addr) 560 extent = next - new_addr; 561 return extent; 562 } 563 564 /* 565 * Should move_pgt_entry() acquire the rmap locks? This is either expressed in 566 * the PMC, or overridden in the case of normal, larger page tables. 567 */ 568 static bool should_take_rmap_locks(struct pagetable_move_control *pmc, 569 enum pgt_entry entry) 570 { 571 switch (entry) { 572 case NORMAL_PMD: 573 case NORMAL_PUD: 574 return true; 575 default: 576 return pmc->need_rmap_locks; 577 } 578 } 579 580 /* 581 * Attempts to speedup the move by moving entry at the level corresponding to 582 * pgt_entry. Returns true if the move was successful, else false. 
583 */ 584 static bool move_pgt_entry(struct pagetable_move_control *pmc, 585 enum pgt_entry entry, void *old_entry, void *new_entry) 586 { 587 bool moved = false; 588 bool need_rmap_locks = should_take_rmap_locks(pmc, entry); 589 590 /* See comment in move_ptes() */ 591 if (need_rmap_locks) 592 take_rmap_locks(pmc->old); 593 594 switch (entry) { 595 case NORMAL_PMD: 596 moved = move_normal_pmd(pmc, old_entry, new_entry); 597 break; 598 case NORMAL_PUD: 599 moved = move_normal_pud(pmc, old_entry, new_entry); 600 break; 601 case HPAGE_PMD: 602 moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 603 move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry, 604 new_entry); 605 break; 606 case HPAGE_PUD: 607 moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 608 move_huge_pud(pmc, old_entry, new_entry); 609 break; 610 611 default: 612 WARN_ON_ONCE(1); 613 break; 614 } 615 616 if (need_rmap_locks) 617 drop_rmap_locks(pmc->old); 618 619 return moved; 620 } 621 622 /* 623 * A helper to check if aligning down is OK. The aligned address should fall 624 * on *no mapping*. For the stack moving down, that's a special move within 625 * the VMA that is created to span the source and destination of the move, 626 * so we make an exception for it. 627 */ 628 static bool can_align_down(struct pagetable_move_control *pmc, 629 struct vm_area_struct *vma, unsigned long addr_to_align, 630 unsigned long mask) 631 { 632 unsigned long addr_masked = addr_to_align & mask; 633 634 /* 635 * If @addr_to_align of either source or destination is not the beginning 636 * of the corresponding VMA, we can't align down or we will destroy part 637 * of the current mapping. 638 */ 639 if (!pmc->for_stack && vma->vm_start != addr_to_align) 640 return false; 641 642 /* In the stack case we explicitly permit in-VMA alignment. 
*/ 643 if (pmc->for_stack && addr_masked >= vma->vm_start) 644 return true; 645 646 /* 647 * Make sure the realignment doesn't cause the address to fall on an 648 * existing mapping. 649 */ 650 return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; 651 } 652 653 /* 654 * Determine if are in fact able to realign for efficiency to a higher page 655 * table boundary. 656 */ 657 static bool can_realign_addr(struct pagetable_move_control *pmc, 658 unsigned long pagetable_mask) 659 { 660 unsigned long align_mask = ~pagetable_mask; 661 unsigned long old_align = pmc->old_addr & align_mask; 662 unsigned long new_align = pmc->new_addr & align_mask; 663 unsigned long pagetable_size = align_mask + 1; 664 unsigned long old_align_next = pagetable_size - old_align; 665 666 /* 667 * We don't want to have to go hunting for VMAs from the end of the old 668 * VMA to the next page table boundary, also we want to make sure the 669 * operation is wortwhile. 670 * 671 * So ensure that we only perform this realignment if the end of the 672 * range being copied reaches or crosses the page table boundary. 673 * 674 * boundary boundary 675 * .<- old_align -> . 676 * . |----------------.-----------| 677 * . | vma . | 678 * . |----------------.-----------| 679 * . <----------------.-----------> 680 * . len_in 681 * <-------------------------------> 682 * . pagetable_size . 683 * . <----------------> 684 * . old_align_next . 685 */ 686 if (pmc->len_in < old_align_next) 687 return false; 688 689 /* Skip if the addresses are already aligned. */ 690 if (old_align == 0) 691 return false; 692 693 /* Only realign if the new and old addresses are mutually aligned. */ 694 if (old_align != new_align) 695 return false; 696 697 /* Ensure realignment doesn't cause overlap with existing mappings. 
*/ 698 if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) || 699 !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask)) 700 return false; 701 702 return true; 703 } 704 705 /* 706 * Opportunistically realign to specified boundary for faster copy. 707 * 708 * Consider an mremap() of a VMA with page table boundaries as below, and no 709 * preceding VMAs from the lower page table boundary to the start of the VMA, 710 * with the end of the range reaching or crossing the page table boundary. 711 * 712 * boundary boundary 713 * . |----------------.-----------| 714 * . | vma . | 715 * . |----------------.-----------| 716 * . pmc->old_addr . pmc->old_end 717 * . <----------------------------> 718 * . move these page tables 719 * 720 * If we proceed with moving page tables in this scenario, we will have a lot of 721 * work to do traversing old page tables and establishing new ones in the 722 * destination across multiple lower level page tables. 723 * 724 * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the 725 * page table boundary, so we can simply copy a single page table entry for the 726 * aligned portion of the VMA instead: 727 * 728 * boundary boundary 729 * . |----------------.-----------| 730 * . | vma . | 731 * . |----------------.-----------| 732 * pmc->old_addr . pmc->old_end 733 * <-------------------------------------------> 734 * . move these page tables 735 */ 736 static void try_realign_addr(struct pagetable_move_control *pmc, 737 unsigned long pagetable_mask) 738 { 739 740 if (!can_realign_addr(pmc, pagetable_mask)) 741 return; 742 743 /* 744 * Simply align to page table boundaries. Note that we do NOT update the 745 * pmc->old_end value, and since the move_page_tables() operation spans 746 * from [old_addr, old_end) (offsetting new_addr as it is performed), 747 * this simply changes the start of the copy, not the end. 
748 */ 749 pmc->old_addr &= pagetable_mask; 750 pmc->new_addr &= pagetable_mask; 751 } 752 753 /* Is the page table move operation done? */ 754 static bool pmc_done(struct pagetable_move_control *pmc) 755 { 756 return pmc->old_addr >= pmc->old_end; 757 } 758 759 /* Advance to the next page table, offset by extent bytes. */ 760 static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent) 761 { 762 pmc->old_addr += extent; 763 pmc->new_addr += extent; 764 } 765 766 /* 767 * Determine how many bytes in the specified input range have had their page 768 * tables moved so far. 769 */ 770 static unsigned long pmc_progress(struct pagetable_move_control *pmc) 771 { 772 unsigned long orig_old_addr = pmc->old_end - pmc->len_in; 773 unsigned long old_addr = pmc->old_addr; 774 775 /* 776 * Prevent negative return values when {old,new}_addr was realigned but 777 * we broke out of the loop in move_page_tables() for the first PMD 778 * itself. 779 */ 780 return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr; 781 } 782 783 unsigned long move_page_tables(struct pagetable_move_control *pmc) 784 { 785 unsigned long extent; 786 struct mmu_notifier_range range; 787 pmd_t *old_pmd, *new_pmd; 788 pud_t *old_pud, *new_pud; 789 struct mm_struct *mm = pmc->old->vm_mm; 790 791 if (!pmc->len_in) 792 return 0; 793 794 if (is_vm_hugetlb_page(pmc->old)) 795 return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr, 796 pmc->new_addr, pmc->len_in); 797 798 /* 799 * If possible, realign addresses to PMD boundary for faster copy. 800 * Only realign if the mremap copying hits a PMD boundary. 
801 */ 802 try_realign_addr(pmc, PMD_MASK); 803 804 flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end); 805 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm, 806 pmc->old_addr, pmc->old_end); 807 mmu_notifier_invalidate_range_start(&range); 808 809 for (; !pmc_done(pmc); pmc_next(pmc, extent)) { 810 cond_resched(); 811 /* 812 * If extent is PUD-sized try to speed up the move by moving at the 813 * PUD level if possible. 814 */ 815 extent = get_extent(NORMAL_PUD, pmc); 816 817 old_pud = get_old_pud(mm, pmc->old_addr); 818 if (!old_pud) 819 continue; 820 new_pud = alloc_new_pud(mm, pmc->new_addr); 821 if (!new_pud) 822 break; 823 if (pud_trans_huge(*old_pud)) { 824 if (extent == HPAGE_PUD_SIZE) { 825 move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); 826 /* We ignore and continue on error? */ 827 continue; 828 } 829 } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { 830 if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud)) 831 continue; 832 } 833 834 extent = get_extent(NORMAL_PMD, pmc); 835 old_pmd = get_old_pmd(mm, pmc->old_addr); 836 if (!old_pmd) 837 continue; 838 new_pmd = alloc_new_pmd(mm, pmc->new_addr); 839 if (!new_pmd) 840 break; 841 again: 842 if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { 843 if (extent == HPAGE_PMD_SIZE && 844 move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) 845 continue; 846 split_huge_pmd(pmc->old, old_pmd, pmc->old_addr); 847 } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && 848 extent == PMD_SIZE) { 849 /* 850 * If the extent is PMD-sized, try to speed the move by 851 * moving at the PMD level if possible. 
852 */ 853 if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd)) 854 continue; 855 } 856 if (pmd_none(*old_pmd)) 857 continue; 858 if (pte_alloc(pmc->new->vm_mm, new_pmd)) 859 break; 860 if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0) 861 goto again; 862 } 863 864 mmu_notifier_invalidate_range_end(&range); 865 866 return pmc_progress(pmc); 867 } 868 869 /* Set vrm->delta to the difference in VMA size specified by user. */ 870 static void vrm_set_delta(struct vma_remap_struct *vrm) 871 { 872 vrm->delta = abs_diff(vrm->old_len, vrm->new_len); 873 } 874 875 /* Determine what kind of remap this is - shrink, expand or no resize at all. */ 876 static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm) 877 { 878 if (vrm->delta == 0) 879 return MREMAP_NO_RESIZE; 880 881 if (vrm->old_len > vrm->new_len) 882 return MREMAP_SHRINK; 883 884 return MREMAP_EXPAND; 885 } 886 887 /* 888 * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs 889 * overlapping? 890 */ 891 static bool vrm_overlaps(struct vma_remap_struct *vrm) 892 { 893 unsigned long start_old = vrm->addr; 894 unsigned long start_new = vrm->new_addr; 895 unsigned long end_old = vrm->addr + vrm->old_len; 896 unsigned long end_new = vrm->new_addr + vrm->new_len; 897 898 /* 899 * start_old end_old 900 * |-----------| 901 * | | 902 * |-----------| 903 * |-------------| 904 * | | 905 * |-------------| 906 * start_new end_new 907 */ 908 if (end_old > start_new && end_new > start_old) 909 return true; 910 911 return false; 912 } 913 914 /* Do the mremap() flags require that the new_addr parameter be specified? */ 915 static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) 916 { 917 return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); 918 } 919 920 /* 921 * Find an unmapped area for the requested vrm->new_addr. 922 * 923 * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. 
 * If only
 * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
 * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
 *
 * Returns 0 on success (with vrm->new_addr updated), or an error code upon
 * failure.
 */
static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
{
	struct vm_area_struct *vma = vrm->vma;
	unsigned long map_flags = 0;
	/* Page Offset _into_ the VMA. */
	pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
	unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
	unsigned long res;

	if (vrm->flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
				map_flags);
	if (IS_ERR_VALUE(res))
		return res;

	vrm->new_addr = res;
	return 0;
}

/*
 * Keep track of pages which have been added to the memory mapping. If the VMA
 * is accounted, also check to see if there is sufficient memory.
 *
 * Returns true on success, false if insufficient memory to charge.
 */
static bool vrm_charge(struct vma_remap_struct *vrm)
{
	unsigned long charged;

	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return true;

	/*
	 * If we don't unmap the old mapping, then we account the entirety of
	 * the length of the new one. Otherwise it's just the delta in size.
	 */
	if (vrm->flags & MREMAP_DONTUNMAP)
		charged = vrm->new_len >> PAGE_SHIFT;
	else
		charged = vrm->delta >> PAGE_SHIFT;


	/* This accounts 'charged' pages of memory. */
	if (security_vm_enough_memory_mm(current->mm, charged))
		return false;

	vrm->charged = charged;
	return true;
}

/*
 * An error has occurred so we will not be using vrm->charged memory.
 * Unaccount
 * this memory if the VMA is accounted.
 */
static void vrm_uncharge(struct vma_remap_struct *vrm)
{
	if (!(vrm->vma->vm_flags & VM_ACCOUNT))
		return;

	vm_unacct_memory(vrm->charged);
	vrm->charged = 0;
}

/*
 * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
 * account for 'bytes' memory used, and if locked, indicate this in the VRM so
 * we can handle this correctly later.
 */
static void vrm_stat_account(struct vma_remap_struct *vrm,
			     unsigned long bytes)
{
	unsigned long pages = bytes >> PAGE_SHIFT;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;

	vm_stat_account(mm, vma->vm_flags, pages);
	if (vma->vm_flags & VM_LOCKED) {
		mm->locked_vm += pages;
		vrm->mlocked = true;
	}
}

/*
 * Perform checks before attempting to write a VMA prior to it being
 * moved.
 */
static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
{
	unsigned long err = 0;
	struct vm_area_struct *vma = vrm->vma;
	unsigned long old_addr = vrm->addr;
	unsigned long old_len = vrm->old_len;
	vm_flags_t dummy = vma->vm_flags;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (current->mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped. But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
			  MADV_UNMERGEABLE, &dummy);
	if (err)
		return err;

	return 0;
}

/*
 * Unmap source VMA for VMA move, turning it from a copy to a move, being
 * careful to ensure we do not underflow memory account while doing so if an
 * accountable move.
 *
 * This is best effort, if we fail to unmap then we simply try to correct
 * accounting and exit.
 */
static void unmap_source_vma(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr = vrm->addr;
	unsigned long len = vrm->old_len;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, addr);
	int err;
	unsigned long vm_start;
	unsigned long vm_end;
	/*
	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
	 * function implies that we unmap the original VMA, which seems
	 * contradictory.
	 *
	 * However, this occurs when this operation was attempted and an error
	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
	 * we actually _do_ want it be unaccounted.
	 */
	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
		!(vrm->flags & MREMAP_DONTUNMAP);

	/*
	 * So we perform a trick here to prevent incorrect accounting. Any merge
	 * or new VMA allocation performed in copy_vma() does not adjust
	 * accounting, it is expected that callers handle this.
	 *
	 * And indeed we already have, accounting appropriately in the case of
	 * both in vrm_charge().
	 *
	 * However, when we unmap the existing VMA (to effect the move), this
	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
	 * removed pages.
	 *
	 * To avoid this we temporarily clear this flag, reinstating on any
	 * portions of the original VMA that remain.
	 */
	if (accountable_move) {
		vm_flags_clear(vma, VM_ACCOUNT);
		/* We are about to split vma, so store the start/end. */
		vm_start = vma->vm_start;
		vm_end = vma->vm_end;
	}

	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
	vrm->vma = NULL; /* Invalidated. */
	if (err) {
		/* OOM: unable to split vma, just get accounts right */
		vm_acct_memory(len >> PAGE_SHIFT);
		return;
	}

	/*
	 * If we mremap() from a VMA like this:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |-------------|
	 * |             |
	 * |-------------|
	 *
	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
	 * we'll end up with:
	 *
	 *    addr  end
	 *     |     |
	 *     v     v
	 * |---|     |---|
	 * | A |     | B |
	 * |---|     |---|
	 *
	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
	 * a subsequent or lone vma_next() will give us B.
	 *
	 * do_vmi_munmap() will have restored the VMI back to addr.
	 */
	if (accountable_move) {
		unsigned long end = addr + len;

		if (vm_start < addr) {
			struct vm_area_struct *prev = vma_prev(&vmi);

			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
		}

		if (vm_end > end) {
			struct vm_area_struct *next = vma_next(&vmi);

			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
		}
	}
}

/*
 * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
 * process. Additionally handle an error occurring on moving of page tables,
 * where we reset vrm state to cause unmapping of the new VMA.
 *
 * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
 * error code.
1169 */ 1170 static int copy_vma_and_data(struct vma_remap_struct *vrm, 1171 struct vm_area_struct **new_vma_ptr) 1172 { 1173 unsigned long internal_offset = vrm->addr - vrm->vma->vm_start; 1174 unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT; 1175 unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff; 1176 unsigned long moved_len; 1177 struct vm_area_struct *vma = vrm->vma; 1178 struct vm_area_struct *new_vma; 1179 int err = 0; 1180 PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len); 1181 1182 new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff, 1183 &pmc.need_rmap_locks); 1184 if (!new_vma) { 1185 vrm_uncharge(vrm); 1186 *new_vma_ptr = NULL; 1187 return -ENOMEM; 1188 } 1189 vrm->vma = vma; 1190 pmc.old = vma; 1191 pmc.new = new_vma; 1192 1193 moved_len = move_page_tables(&pmc); 1194 if (moved_len < vrm->old_len) 1195 err = -ENOMEM; 1196 else if (vma->vm_ops && vma->vm_ops->mremap) 1197 err = vma->vm_ops->mremap(new_vma); 1198 1199 if (unlikely(err)) { 1200 PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr, 1201 vrm->addr, moved_len); 1202 1203 /* 1204 * On error, move entries back from new area to old, 1205 * which will succeed since page tables still there, 1206 * and then proceed to unmap new area instead of old. 1207 */ 1208 pmc_revert.need_rmap_locks = true; 1209 move_page_tables(&pmc_revert); 1210 1211 vrm->vma = new_vma; 1212 vrm->old_len = vrm->new_len; 1213 vrm->addr = vrm->new_addr; 1214 } else { 1215 mremap_userfaultfd_prep(new_vma, vrm->uf); 1216 } 1217 1218 fixup_hugetlb_reservations(vma); 1219 1220 *new_vma_ptr = new_vma; 1221 return err; 1222 } 1223 1224 /* 1225 * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() and 1226 * account flags on remaining VMA by convention (it cannot be mlock()'d any 1227 * longer, as pages in range are no longer mapped), and removing anon_vma_chain 1228 * links from it (if the entire VMA was copied over). 
1229 */ 1230 static void dontunmap_complete(struct vma_remap_struct *vrm, 1231 struct vm_area_struct *new_vma) 1232 { 1233 unsigned long start = vrm->addr; 1234 unsigned long end = vrm->addr + vrm->old_len; 1235 unsigned long old_start = vrm->vma->vm_start; 1236 unsigned long old_end = vrm->vma->vm_end; 1237 1238 /* 1239 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old 1240 * vma. 1241 */ 1242 vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT); 1243 1244 /* 1245 * anon_vma links of the old vma is no longer needed after its page 1246 * table has been moved. 1247 */ 1248 if (new_vma != vrm->vma && start == old_start && end == old_end) 1249 unlink_anon_vmas(vrm->vma); 1250 1251 /* Because we won't unmap we don't need to touch locked_vm. */ 1252 } 1253 1254 static unsigned long move_vma(struct vma_remap_struct *vrm) 1255 { 1256 struct mm_struct *mm = current->mm; 1257 struct vm_area_struct *new_vma; 1258 unsigned long hiwater_vm; 1259 int err; 1260 1261 err = prep_move_vma(vrm); 1262 if (err) 1263 return err; 1264 1265 /* If accounted, charge the number of bytes the operation will use. */ 1266 if (!vrm_charge(vrm)) 1267 return -ENOMEM; 1268 1269 /* We don't want racing faults. */ 1270 vma_start_write(vrm->vma); 1271 1272 /* Perform copy step. */ 1273 err = copy_vma_and_data(vrm, &new_vma); 1274 /* 1275 * If we established the copied-to VMA, we attempt to recover from the 1276 * error by setting the destination VMA to the source VMA and unmapping 1277 * it below. 1278 */ 1279 if (err && !new_vma) 1280 return err; 1281 1282 /* 1283 * If we failed to move page tables we still do total_vm increment 1284 * since do_munmap() will decrement it by old_len == new_len. 1285 * 1286 * Since total_vm is about to be raised artificially high for a 1287 * moment, we need to restore high watermark afterwards: if stats 1288 * are taken meanwhile, total_vm and hiwater_vm appear too high. 1289 * If this were a serious issue, we'd add a flag to do_munmap(). 
1290 */ 1291 hiwater_vm = mm->hiwater_vm; 1292 1293 vrm_stat_account(vrm, vrm->new_len); 1294 if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) 1295 dontunmap_complete(vrm, new_vma); 1296 else 1297 unmap_source_vma(vrm); 1298 1299 mm->hiwater_vm = hiwater_vm; 1300 1301 return err ? (unsigned long)err : vrm->new_addr; 1302 } 1303 1304 /* 1305 * resize_is_valid() - Ensure the vma can be resized to the new length at the give 1306 * address. 1307 * 1308 * Return 0 on success, error otherwise. 1309 */ 1310 static int resize_is_valid(struct vma_remap_struct *vrm) 1311 { 1312 struct mm_struct *mm = current->mm; 1313 struct vm_area_struct *vma = vrm->vma; 1314 unsigned long addr = vrm->addr; 1315 unsigned long old_len = vrm->old_len; 1316 unsigned long new_len = vrm->new_len; 1317 unsigned long pgoff; 1318 1319 /* 1320 * !old_len is a special case where an attempt is made to 'duplicate' 1321 * a mapping. This makes no sense for private mappings as it will 1322 * instead create a fresh/new mapping unrelated to the original. This 1323 * is contrary to the basic idea of mremap which creates new mappings 1324 * based on the original. There are no known use cases for this 1325 * behavior. As a result, fail such attempts. 1326 */ 1327 if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { 1328 pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. 
This is not supported.\n", 1329 current->comm, current->pid); 1330 return -EINVAL; 1331 } 1332 1333 if ((vrm->flags & MREMAP_DONTUNMAP) && 1334 (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) 1335 return -EINVAL; 1336 1337 /* We can't remap across vm area boundaries */ 1338 if (old_len > vma->vm_end - addr) 1339 return -EFAULT; 1340 1341 if (new_len == old_len) 1342 return 0; 1343 1344 /* Need to be careful about a growing mapping */ 1345 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 1346 pgoff += vma->vm_pgoff; 1347 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) 1348 return -EINVAL; 1349 1350 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 1351 return -EFAULT; 1352 1353 if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) 1354 return -EAGAIN; 1355 1356 if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) 1357 return -ENOMEM; 1358 1359 return 0; 1360 } 1361 1362 /* 1363 * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so 1364 * execute this, optionally dropping the mmap lock when we do so. 1365 * 1366 * In both cases this invalidates the VMA, however if we don't drop the lock, 1367 * then load the correct VMA into vrm->vma afterwards. 1368 */ 1369 static unsigned long shrink_vma(struct vma_remap_struct *vrm, 1370 bool drop_lock) 1371 { 1372 struct mm_struct *mm = current->mm; 1373 unsigned long unmap_start = vrm->addr + vrm->new_len; 1374 unsigned long unmap_bytes = vrm->delta; 1375 unsigned long res; 1376 VMA_ITERATOR(vmi, mm, unmap_start); 1377 1378 VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK); 1379 1380 res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes, 1381 vrm->uf_unmap, drop_lock); 1382 vrm->vma = NULL; /* Invalidated. */ 1383 if (res) 1384 return res; 1385 1386 /* 1387 * If we've not dropped the lock, then we should reload the VMA to 1388 * replace the invalidated VMA with the one that may have now been 1389 * split. 
1390 */ 1391 if (drop_lock) { 1392 vrm->mmap_locked = false; 1393 } else { 1394 vrm->vma = vma_lookup(mm, vrm->addr); 1395 if (!vrm->vma) 1396 return -EFAULT; 1397 } 1398 1399 return 0; 1400 } 1401 1402 /* 1403 * mremap_to() - remap a vma to a new location. 1404 * Returns: The new address of the vma or an error. 1405 */ 1406 static unsigned long mremap_to(struct vma_remap_struct *vrm) 1407 { 1408 struct mm_struct *mm = current->mm; 1409 unsigned long err; 1410 1411 /* Is the new length or address silly? */ 1412 if (vrm->new_len > TASK_SIZE || 1413 vrm->new_addr > TASK_SIZE - vrm->new_len) 1414 return -EINVAL; 1415 1416 if (vrm_overlaps(vrm)) 1417 return -EINVAL; 1418 1419 if (vrm->flags & MREMAP_FIXED) { 1420 /* 1421 * In mremap_to(). 1422 * VMA is moved to dst address, and munmap dst first. 1423 * do_munmap will check if dst is sealed. 1424 */ 1425 err = do_munmap(mm, vrm->new_addr, vrm->new_len, 1426 vrm->uf_unmap_early); 1427 vrm->vma = NULL; /* Invalidated. */ 1428 if (err) 1429 return err; 1430 1431 /* 1432 * If we remap a portion of a VMA elsewhere in the same VMA, 1433 * this can invalidate the old VMA. Reset. 1434 */ 1435 vrm->vma = vma_lookup(mm, vrm->addr); 1436 if (!vrm->vma) 1437 return -EFAULT; 1438 } 1439 1440 if (vrm->remap_type == MREMAP_SHRINK) { 1441 err = shrink_vma(vrm, /* drop_lock= */false); 1442 if (err) 1443 return err; 1444 1445 /* Set up for the move now shrink has been executed. 
*/ 1446 vrm->old_len = vrm->new_len; 1447 } 1448 1449 err = resize_is_valid(vrm); 1450 if (err) 1451 return err; 1452 1453 /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ 1454 if (vrm->flags & MREMAP_DONTUNMAP) { 1455 vm_flags_t vm_flags = vrm->vma->vm_flags; 1456 unsigned long pages = vrm->old_len >> PAGE_SHIFT; 1457 1458 if (!may_expand_vm(mm, vm_flags, pages)) 1459 return -ENOMEM; 1460 } 1461 1462 err = vrm_set_new_addr(vrm); 1463 if (err) 1464 return err; 1465 1466 return move_vma(vrm); 1467 } 1468 1469 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) 1470 { 1471 unsigned long end = vma->vm_end + delta; 1472 1473 if (end < vma->vm_end) /* overflow */ 1474 return 0; 1475 if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) 1476 return 0; 1477 if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 1478 0, MAP_FIXED) & ~PAGE_MASK) 1479 return 0; 1480 return 1; 1481 } 1482 1483 /* Determine whether we are actually able to execute an in-place expansion. */ 1484 static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) 1485 { 1486 /* Number of bytes from vrm->addr to end of VMA. */ 1487 unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr; 1488 1489 /* If end of range aligns to end of VMA, we can just expand in-place. */ 1490 if (suffix_bytes != vrm->old_len) 1491 return false; 1492 1493 /* Check whether this is feasible. */ 1494 if (!vma_expandable(vrm->vma, vrm->delta)) 1495 return false; 1496 1497 return true; 1498 } 1499 1500 /* 1501 * Are the parameters passed to mremap() valid? If so return 0, otherwise return 1502 * error. 1503 */ 1504 static unsigned long check_mremap_params(struct vma_remap_struct *vrm) 1505 1506 { 1507 unsigned long addr = vrm->addr; 1508 unsigned long flags = vrm->flags; 1509 1510 /* Ensure no unexpected flag values. */ 1511 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) 1512 return -EINVAL; 1513 1514 /* Start address must be page-aligned. 
*/ 1515 if (offset_in_page(addr)) 1516 return -EINVAL; 1517 1518 /* 1519 * We allow a zero old-len as a special case 1520 * for DOS-emu "duplicate shm area" thing. But 1521 * a zero new-len is nonsensical. 1522 */ 1523 if (!PAGE_ALIGN(vrm->new_len)) 1524 return -EINVAL; 1525 1526 /* Remainder of checks are for cases with specific new_addr. */ 1527 if (!vrm_implies_new_addr(vrm)) 1528 return 0; 1529 1530 /* The new address must be page-aligned. */ 1531 if (offset_in_page(vrm->new_addr)) 1532 return -EINVAL; 1533 1534 /* A fixed address implies a move. */ 1535 if (!(flags & MREMAP_MAYMOVE)) 1536 return -EINVAL; 1537 1538 /* MREMAP_DONTUNMAP does not allow resizing in the process. */ 1539 if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) 1540 return -EINVAL; 1541 1542 /* 1543 * move_vma() need us to stay 4 maps below the threshold, otherwise 1544 * it will bail out at the very beginning. 1545 * That is a problem if we have already unmaped the regions here 1546 * (new_addr, and old_addr), because userspace will not know the 1547 * state of the vma's after it gets -ENOMEM. 1548 * So, to avoid such scenario we can pre-compute if the whole 1549 * operation has high chances to success map-wise. 1550 * Worst-scenario case is when both vma's (new_addr and old_addr) get 1551 * split in 3 before unmapping it. 1552 * That means 2 more maps (1 for each) to the ones we already hold. 1553 * Check whether current map count plus 2 still leads us to 4 maps below 1554 * the threshold, otherwise return -ENOMEM here to be more safe. 1555 */ 1556 if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) 1557 return -ENOMEM; 1558 1559 return 0; 1560 } 1561 1562 /* 1563 * We know we can expand the VMA in-place by delta pages, so do so. 1564 * 1565 * If we discover the VMA is locked, update mm_struct statistics accordingly and 1566 * indicate so to the caller. 
 */
static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = vrm->vma;
	VMA_ITERATOR(vmi, mm, vma->vm_end);

	/* Charge for the delta before attempting the merge/extend. */
	if (!vrm_charge(vrm))
		return -ENOMEM;

	/*
	 * Function vma_merge_extend() is called on the
	 * extension we are adding to the already existing vma,
	 * vma_merge_extend() will merge this extension with the
	 * already existing vma (expand operation itself) and
	 * possibly also with the next vma if it becomes
	 * adjacent to the expanded vma and otherwise
	 * compatible.
	 */
	vma = vma_merge_extend(&vmi, vma, vrm->delta);
	if (!vma) {
		/* Merge failed - give back what we charged above. */
		vrm_uncharge(vrm);
		return -ENOMEM;
	}
	vrm->vma = vma;

	vrm_stat_account(vrm, vrm->delta);

	return 0;
}

/*
 * Align the mremap request to hugetlb page boundaries, rejecting requests
 * that are misaligned or would expand the mapping. Updates vrm lengths and
 * delta; returns true if the (aligned) request is acceptable.
 */
static bool align_hugetlb(struct vma_remap_struct *vrm)
{
	struct hstate *h __maybe_unused = hstate_vma(vrm->vma);

	vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
	vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));

	/* addrs must be huge page aligned */
	if (vrm->addr & ~huge_page_mask(h))
		return false;
	if (vrm->new_addr & ~huge_page_mask(h))
		return false;

	/*
	 * Don't allow remap expansion, because the underlying hugetlb
	 * reservation is not yet capable to handle split reservation.
	 */
	if (vrm->new_len > vrm->old_len)
		return false;

	/* Lengths may have changed above, so recompute the delta. */
	vrm_set_delta(vrm);

	return true;
}

/*
 * We are mremap()'ing without specifying a fixed address to move to, but are
 * requesting that the VMA's size be increased.
 *
 * Try to do so in-place, if this fails, then move the VMA to a new location to
 * action the change.
1629 */ 1630 static unsigned long expand_vma(struct vma_remap_struct *vrm) 1631 { 1632 unsigned long err; 1633 unsigned long addr = vrm->addr; 1634 1635 err = resize_is_valid(vrm); 1636 if (err) 1637 return err; 1638 1639 /* 1640 * [addr, old_len) spans precisely to the end of the VMA, so try to 1641 * expand it in-place. 1642 */ 1643 if (vrm_can_expand_in_place(vrm)) { 1644 err = expand_vma_in_place(vrm); 1645 if (err) 1646 return err; 1647 1648 /* 1649 * We want to populate the newly expanded portion of the VMA to 1650 * satisfy the expectation that mlock()'ing a VMA maintains all 1651 * of its pages in memory. 1652 */ 1653 if (vrm->mlocked) 1654 vrm->new_addr = addr; 1655 1656 /* OK we're done! */ 1657 return addr; 1658 } 1659 1660 /* 1661 * We weren't able to just expand or shrink the area, 1662 * we need to create a new one and move it. 1663 */ 1664 1665 /* We're not allowed to move the VMA, so error out. */ 1666 if (!(vrm->flags & MREMAP_MAYMOVE)) 1667 return -ENOMEM; 1668 1669 /* Find a new location to move the VMA to. */ 1670 err = vrm_set_new_addr(vrm); 1671 if (err) 1672 return err; 1673 1674 return move_vma(vrm); 1675 } 1676 1677 /* 1678 * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the 1679 * first available address to perform the operation. 1680 */ 1681 static unsigned long mremap_at(struct vma_remap_struct *vrm) 1682 { 1683 unsigned long res; 1684 1685 switch (vrm->remap_type) { 1686 case MREMAP_INVALID: 1687 break; 1688 case MREMAP_NO_RESIZE: 1689 /* NO-OP CASE - resizing to the same size. */ 1690 return vrm->addr; 1691 case MREMAP_SHRINK: 1692 /* 1693 * SHRINK CASE. Can always be done in-place. 1694 * 1695 * Simply unmap the shrunken portion of the VMA. This does all 1696 * the needed commit accounting, and we indicate that the mmap 1697 * lock should be dropped. 
1698 */ 1699 res = shrink_vma(vrm, /* drop_lock= */true); 1700 if (res) 1701 return res; 1702 1703 return vrm->addr; 1704 case MREMAP_EXPAND: 1705 return expand_vma(vrm); 1706 } 1707 1708 BUG(); 1709 } 1710 1711 static unsigned long do_mremap(struct vma_remap_struct *vrm) 1712 { 1713 struct mm_struct *mm = current->mm; 1714 struct vm_area_struct *vma; 1715 unsigned long ret; 1716 1717 ret = check_mremap_params(vrm); 1718 if (ret) 1719 return ret; 1720 1721 vrm->old_len = PAGE_ALIGN(vrm->old_len); 1722 vrm->new_len = PAGE_ALIGN(vrm->new_len); 1723 vrm_set_delta(vrm); 1724 1725 if (mmap_write_lock_killable(mm)) 1726 return -EINTR; 1727 vrm->mmap_locked = true; 1728 1729 vma = vrm->vma = vma_lookup(mm, vrm->addr); 1730 if (!vma) { 1731 ret = -EFAULT; 1732 goto out; 1733 } 1734 1735 /* If mseal()'d, mremap() is prohibited. */ 1736 if (!can_modify_vma(vma)) { 1737 ret = -EPERM; 1738 goto out; 1739 } 1740 1741 /* Align to hugetlb page size, if required. */ 1742 if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) { 1743 ret = -EINVAL; 1744 goto out; 1745 } 1746 1747 vrm->remap_type = vrm_remap_type(vrm); 1748 1749 /* Actually execute mremap. */ 1750 ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); 1751 1752 out: 1753 if (vrm->mmap_locked) { 1754 mmap_write_unlock(mm); 1755 vrm->mmap_locked = false; 1756 1757 if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len) 1758 mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); 1759 } 1760 1761 userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); 1762 mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len); 1763 userfaultfd_unmap_complete(mm, vrm->uf_unmap); 1764 1765 return ret; 1766 } 1767 1768 /* 1769 * Expand (or shrink) an existing mapping, potentially moving it at the 1770 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1771 * 1772 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1773 * This option implies MREMAP_MAYMOVE. 
1774 */ 1775 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, 1776 unsigned long, new_len, unsigned long, flags, 1777 unsigned long, new_addr) 1778 { 1779 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; 1780 LIST_HEAD(uf_unmap_early); 1781 LIST_HEAD(uf_unmap); 1782 /* 1783 * There is a deliberate asymmetry here: we strip the pointer tag 1784 * from the old address but leave the new address alone. This is 1785 * for consistency with mmap(), where we prevent the creation of 1786 * aliasing mappings in userspace by leaving the tag bits of the 1787 * mapping address intact. A non-zero tag will cause the subsequent 1788 * range checks to reject the address as invalid. 1789 * 1790 * See Documentation/arch/arm64/tagged-address-abi.rst for more 1791 * information. 1792 */ 1793 struct vma_remap_struct vrm = { 1794 .addr = untagged_addr(addr), 1795 .old_len = old_len, 1796 .new_len = new_len, 1797 .flags = flags, 1798 .new_addr = new_addr, 1799 1800 .uf = &uf, 1801 .uf_unmap_early = &uf_unmap_early, 1802 .uf_unmap = &uf_unmap, 1803 1804 .remap_type = MREMAP_INVALID, /* We set later. */ 1805 }; 1806 1807 return do_mremap(&vrm); 1808 } 1809