// SPDX-License-Identifier: GPL-2.0
/*
 * mm/mprotect.c
 *
 * (C) Copyright 1994 Linus Torvalds
 * (C) Copyright 2002 Christoph Hellwig
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>

#include "internal.h"

static inline bool can_change_pte_writable(struct vm_area_struct *vma,
					   unsigned long addr, pte_t pte)
{
	struct page *page;

	VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));

	if (pte_protnone(pte) || !pte_dirty(pte))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_pte_wp(vma, pte))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/*
		 * We can only special-case on exclusive anonymous pages,
		 * because we know that our write-fault handler similarly would
		 * map them writable without any additional checks while holding
		 * the PT lock.
		 */
		page = vm_normal_page(vma, addr, pte);
		if (!page || !PageAnon(page) || !PageAnonExclusive(page))
			return false;
	}

	return true;
}

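/*
 * Change protections on the ptes mapped by @pmd in [addr, end): rewrite
 * present entries with @newprot, apply the NUMA-hinting and userfaultfd
 * write-protect policies requested in @cp_flags, and downgrade writable
 * migration/device swap entries to read-only ones.  Returns the number
 * of entries that were updated.
 */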
static unsigned long change_pte_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	tlb_change_page_size(tlb, PAGE_SIZE);

	/*
	 * Can be called with only the mmap_lock held for reading by
	 * prot_numa, so we must check that the pmd isn't changing from
	 * under us from pmd_none to pmd_trans_huge and/or the other way
	 * around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte so the pmd can't change
	 * from under us even if the mmap_lock is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;
				int nid;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || is_zone_device_page(page) || PageKsm(page))
					continue;

				/* Also skip shared copy-on-write pages */
				if (is_cow_mapping(vma->vm_flags) &&
				    page_count(page) != 1)
					continue;

				/*
				 * While migration can move some dirty pages,
				 * it cannot move them all from MIGRATE_ASYNC
				 * context.
				 */
				if (page_is_file_lru(page) && PageDirty(page))
					continue;

				/*
				 * Don't mess with PTEs if the page is already
				 * on the node a single-threaded process is
				 * running on.
				 */
				nid = page_to_nid(page);
				if (target_node == nid)
					continue;

				/*
				 * Skip scanning the top-tier node if normal
				 * NUMA balancing is disabled.
				 */
				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
				    node_is_toptier(nid))
					continue;
			}

			oldpte = ptep_modify_prot_start(vma, addr, pte);
			ptent = pte_modify(oldpte, newprot);
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			if (uffd_wp) {
				ptent = pte_wrprotect(ptent);
				ptent = pte_mkuffd_wp(ptent);
			} else if (uffd_wp_resolve) {
				ptent = pte_clear_uffd_wp(ptent);
			}

			/*
			 * In some writable, shared mappings, we might want
			 * to catch actual write access -- see
			 * vma_wants_writenotify().
			 *
			 * In all writable, private mappings, we have to
			 * properly handle COW.
			 *
			 * In both cases, we can sometimes still change PTEs
			 * writable and avoid the write-fault handler, for
			 * example, if a PTE is already dirty and no other
			 * COW or special handling is required.
			 */
			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
			    !pte_write(ptent) &&
			    can_change_pte_writable(vma, addr, ptent))
				ptent = pte_mkwrite(ptent);

			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
			if (pte_needs_flush(oldpte, ptent))
				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
			pages++;
		} else if (is_swap_pte(oldpte)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);
			struct page *page = pfn_swap_entry_to_page(entry);
			pte_t newpte;

			if (is_writable_migration_entry(entry)) {
				/*
				 * A protection check is difficult so
				 * just be safe and disable write.
				 */
				if (PageAnon(page))
					entry = make_readable_exclusive_migration_entry(
							swp_offset(entry));
				else
					entry = make_readable_migration_entry(swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_writable_device_private_entry(entry)) {
				/*
				 * We do not preserve soft-dirtiness. See
				 * copy_one_pte() for explanation.
				 */
				entry = make_readable_device_private_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_writable_device_exclusive_entry(entry)) {
				entry = make_readable_device_exclusive_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (pte_marker_entry_uffd_wp(entry)) {
				/*
				 * If this is a uffd-wp pte marker and we'd
				 * like to unprotect it, drop it; the next
				 * page fault will trigger without uffd
				 * trapping.
				 */
				if (uffd_wp_resolve) {
					pte_clear(vma->vm_mm, addr, pte);
					pages++;
				}
				continue;
			} else {
				newpte = oldpte;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);

			if (!pte_same(oldpte, newpte)) {
				set_pte_at(vma->vm_mm, addr, pte, newpte);
				pages++;
			}
		} else {
			/* It must be a none pte, or what else?.. */
			WARN_ON_ONCE(!pte_none(oldpte));
			if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
				/*
				 * For file-backed mem, we need to be able to
				 * wr-protect a none pte, because even if the
				 * pte is none, the page/swap cache could
				 * exist.  Do that by installing a marker.
				 */
				set_pte_at(vma->vm_mm, addr, pte,
					   make_pte_marker(PTE_MARKER_UFFD_WP));
				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

/*
 * Used when setting automatic NUMA hinting protection where it is
 * critical that a numa hinting PMD is not confused with a bad PMD.
 */
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
	pmd_t pmdval = pmd_read_atomic(pmd);

	/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	barrier();
#endif

	if (pmd_none(pmdval))
		return 1;
	if (pmd_trans_huge(pmdval))
		return 0;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		return 1;
	}

	return 0;
}

/* Return true if we're uffd wr-protecting file-backed memory, or false */
static inline bool
uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
{
	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}

/*
 * If wr-protecting the range for file-backed memory, populate the page table
 * for the case where the page table is empty but the page cache exists.  If
 * {pte|pmd|...}_alloc() fails it means we are out of memory; there is no
 * better option but to stop.
 */
#define change_pmd_prepare(vma, pmd, cp_flags)				\
	do {								\
		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
			if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd)))	\
				break;					\
		}							\
	} while (0)

/*
 * This is the general pud/p4d/pgd version of change_pmd_prepare().  We need a
 * separate change_pmd_prepare() because pte_alloc() returns 0 on success,
 * while {pmd|pud|p4d}_alloc() returns a valid pointer on success.
 */
#define change_prepare(vma, high, low, addr, cp_flags)			\
	do {								\
		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
			if (WARN_ON_ONCE(p == NULL))			\
				break;					\
		}							\
	} while (0)

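/*
 * Walk the pmds under @pud in [addr, end).  Huge pmds are either changed in
 * place via change_huge_pmd() or split and handled at the pte level by
 * change_pte_range().  The mmu notifier range is only started once a
 * populated pmd is actually found.  Returns the number of pages whose
 * protection was changed.
 */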
static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	struct mmu_notifier_range range;

	range.start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);

		change_pmd_prepare(vma, pmd, cp_flags);
		/*
		 * Automatic NUMA balancing walks the tables with mmap_lock
		 * held for read.  It's possible for a parallel update to
		 * occur between pmd_trans_huge() and a pmd_none_or_clear_bad()
		 * check, leading to a false positive and clearing.
		 * Hence, it's necessary to atomically read the PMD value
		 * for all the checks.
		 */
		if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
		    pmd_none_or_clear_bad_unless_trans_huge(pmd))
			goto next;

		/* invoke the mmu notifier if the pmd is populated */
		if (!range.start) {
			mmu_notifier_range_init(&range,
				MMU_NOTIFY_PROTECTION_VMA, 0,
				vma, vma->vm_mm, addr, end);
			mmu_notifier_invalidate_range_start(&range);
		}

		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if ((next - addr != HPAGE_PMD_SIZE) ||
			    uffd_wp_protect_file(vma, cp_flags)) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
				/*
				 * For file-backed mappings, the pmd could
				 * have been cleared; make sure the pmd is
				 * populated if necessary, then fall through
				 * to the pte level.
				 */
				change_pmd_prepare(vma, pmd, cp_flags);
			} else {
				/*
				 * change_huge_pmd() does not defer TLB flushes,
				 * so no need to propagate the tlb argument.
				 */
				int nr_ptes = change_huge_pmd(tlb, vma, pmd,
						addr, newprot, cp_flags);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					goto next;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(tlb, vma, pmd, addr, next,
					      newprot, cp_flags);
		pages += this_pages;
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	if (range.start)
		mmu_notifier_invalidate_range_end(&range);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

static inline unsigned long change_pud_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		change_prepare(vma, pud, pmd, addr, cp_flags);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
					  cp_flags);
	} while (pud++, addr = next, addr != end);

	return pages;
}

static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long pages = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		change_prepare(vma, p4d, pud, addr, cp_flags);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
					  cp_flags);
	} while (p4d++, addr = next, addr != end);

	return pages;
}

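/*
 * Top of the page-table walk: iterate the pgd/p4d/pud entries covering
 * [addr, end) within @vma and hand each populated range down to
 * change_pmd_range().  TLB flushing is batched through @tlb.
 */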
static unsigned long change_protection_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	tlb_start_vma(tlb, vma);
	do {
		next = pgd_addr_end(addr, end);
		change_prepare(vma, pgd, p4d, addr, cp_flags);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
					  cp_flags);
	} while (pgd++, addr = next, addr != end);

	tlb_end_vma(tlb, vma);

	return pages;
}

unsigned long change_protection(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, pgprot_t newprot,
		unsigned long cp_flags)
{
	unsigned long pages;

	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot,
						  cp_flags);
	else
		pages = change_protection_range(tlb, vma, start, end, newprot,
						cp_flags);

	return pages;
}

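/*
 * Page-walk callbacks used by mprotect_fixup() to check, via
 * pfn_modify_allowed(), that every pfn in a VM_PFNMAP/VM_MIXEDMAP mapping
 * may be switched to a no-access protection before any state is changed.
 */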
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
	.pte_entry	= prot_none_pte_entry,
	.hugetlb_entry	= prot_none_hugetlb_entry,
	.test_walk	= prot_none_test,
};

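/*
 * Apply @newflags to the part of @vma covering [start, end): merge or split
 * the VMA as needed, update accounting, and change the page protections via
 * change_protection().  *pprev is set to the VMA that now covers the range.
 */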
int
mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
	       struct vm_area_struct **pprev, unsigned long start,
	       unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	bool try_change_writable;
	pgoff_t pgoff;
	int error;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state.  This is a rather
	 * uncommon case, so doesn't need to be very optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & VM_ACCESS_FLAGS) == 0) {
		pgprot_t new_pgprot = vm_get_page_prot(newflags);

		error = walk_page_range(current->mm, start, end,
					&prot_none_walk_ops, &new_pgprot);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again.  hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
		    may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
				  VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_lock
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	/*
	 * We want to check manually if we can change individual PTEs writable
	 * if we can't do that automatically for all PTEs in a mapping.  For
	 * private mappings, that's always the case when we have write
	 * permissions, as we properly have to handle COW.
	 */
	if (vma->vm_flags & VM_SHARED)
		try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
	else
		try_change_writable = !!(vma->vm_flags & VM_WRITE);
	vma_set_page_prot(vma);

	change_protection(tlb, vma, start, end, vma->vm_page_prot,
			  try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
	    (newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);
	struct mmu_gather tlb;

	start = untagged_addr(start);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot, start))
		return -EINVAL;

	reqprot = prot;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;

	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}

	if (start > vma->vm_start)
		prev = vma;
	else
		prev = vma->vm_prev;

	tlb_gather_mmu(&tlb, current->mm);
	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC? */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					VM_FLAGS_CLEAR;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
			error = -EACCES;
			break;
		}

		/* Allow architectures to sanity-check the new flags */
		if (!arch_validate_flags(newflags)) {
			error = -EINVAL;
			break;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			break;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;

		if (vma->vm_ops && vma->vm_ops->mprotect) {
			error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
			if (error)
				break;
		}

		error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags);
		if (error)
			break;

		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
		prot = reqprot;
	}
	tlb_finish_mmu(&tlb);
out:
	mmap_write_unlock(current->mm);
	return error;
}

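/*
 * Both entry points below funnel into do_mprotect_pkey(); roughly:
 *
 *	mprotect(addr, len, prot)             -> do_mprotect_pkey(addr, len, prot, -1)
 *	pkey_mprotect(addr, len, prot, pkey)  -> do_mprotect_pkey(addr, len, prot, pkey)
 */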
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	mmap_write_lock(current->mm);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	mmap_write_lock(current->mm);
	ret = mm_pkey_free(current->mm, pkey);
	mmap_write_unlock(current->mm);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */