#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}
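/*
 * Rough sketch of how FOLL_COW comes into play on the FOLL_FORCE write
 * path (illustrative only; the authoritative logic lives in faultin_page()
 * and follow_page_pte() below):
 *
 *	follow_page_pte()	-> pte is not writable, returns NULL
 *	faultin_page()		-> handle_mm_fault() breaks COW; on
 *				   VM_FAULT_WRITE in a !VM_WRITE vma the
 *				   caller's flags gain FOLL_COW
 *	follow_page_pte()	-> retried lookup, can_follow_write_pte()
 *				   now accepts the dirty, write-protected pte
 */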
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap = NULL;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
		if (pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {
		get_page(page);

		/* drop the pgmap reference now that we hold the page */
		if (pgmap) {
			put_dev_pagemap(pgmap);
			pgmap = NULL;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      unsigned int *page_mask)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	*page_mask = 0;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		return page;
	}

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return no_page_table(vma, flags);
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pmd_devmap(*pmd)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(*pmd)))
		return follow_page_pte(vma, address, pmd, flags);

	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		return no_page_table(vma, flags);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags);
	}
	if (flags & FOLL_SPLIT) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else {
			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags);
	}

	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	*page_mask = HPAGE_PMD_NR - 1;
	return page;
}
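/*
 * Illustrative sketch of how a caller consumes *page_mask (this is what
 * __get_user_pages() below does): for a PMD-mapped THP, *page_mask is
 * HPAGE_PMD_NR - 1, so the remaining subpages of the huge page can be
 * stepped over in a single increment:
 *
 *	page = follow_page_mask(vma, start, foll_flags, &page_mask);
 *	page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
 *	start += page_increm * PAGE_SIZE;
 */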
static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	BUG_ON(pgd_none(*pgd));
	p4d = p4d_offset(pgd, address);
	BUG_ON(p4d_none(*p4d));
	pud = pud_offset(p4d, address);
	BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	get_page(*page);
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	int ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	/* For mm_populate(), just skip the stack guard page. */
	if ((*flags & FOLL_POPULATE) &&
			(stack_guard_page_start(vma, address) ||
			 stack_guard_page_end(vma, address + PAGE_SIZE)))
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking)
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}
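/*
 * Summary of faultin_page() return values, as consumed by the switch in
 * __get_user_pages():
 *
 *	0	 fault handled, retry the follow_page_mask() lookup
 *	-EBUSY	 VM_FAULT_RETRY fired; *nonblocking was cleared and the
 *		 mmap_sem may have been released
 *	-ENOENT	 nothing to do for this address, skip to the next page
 *	other	 fatal for this address (-EFAULT, -ENOMEM, -EHWPOISON)
 */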
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long i = 0;
	unsigned int page_mask;
	struct vm_area_struct *vma = NULL;

	if (!nr_pages)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				int ret;
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					return i ? : ret;
				page_mask = 0;
				goto next_page;
			}

			if (!vma || check_vma_flags(vma, gup_flags))
				return i ? : -EFAULT;
			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (unlikely(fatal_signal_pending(current)))
			return i ? i : -ERESTARTSYS;
		cond_resched();
		page = follow_page_mask(vma, start, foll_flags, &page_mask);
		if (!page) {
			int ret;
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				return i ? i : ret;
			case -EBUSY:
				return i;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			return i ? i : PTR_ERR(page);
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
	return i;
}
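/*
 * Minimal sketch of the pin/use/release lifecycle described in the comment
 * above (illustrative only, error handling omitted; a FOLL_WRITE caller
 * must dirty the pages before dropping its references):
 *
 *	down_read(&current->mm->mmap_sem);
 *	npages = get_user_pages(addr, n, FOLL_WRITE, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	... access the pages via kmap()/DMA ...
 *	for (i = 0; i < npages; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */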
static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
 *		does not allow retry
 *
 * This is meant to be called in the specific scenario where, for locking
 * reasons, we try to access user memory in atomic context (within a
 * pagefault_disable() section), that access returns -EFAULT, and we want
 * to resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_sem.  So it does not
 * have the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	int ret, major = 0;

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	ret = handle_mm_fault(vma, address, fault_flags);
	major |= ret & VM_FAULT_MAJOR;
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		down_read(&mm->mmap_sem);
		if (!(fault_flags & FAULT_FLAG_TRIED)) {
			*unlocked = true;
			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			fault_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	if (tsk) {
		if (major)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
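/*
 * Sketch of the futex-style usage this is meant for (roughly what the futex
 * code does to fault a user word writable; illustrative only):
 *
 *	down_read(&mm->mmap_sem);
 *	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 *			       FAULT_FLAG_WRITE, NULL);
 *	up_read(&mm->mmap_sem);
 *
 * after which the caller retries its pagefault_disable()d atomic access.
 */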
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						struct vm_area_struct **vmas,
						int *locked, bool notify_drop,
						unsigned int flags)
{
	long ret, pages_done;
	bool lock_dropped;

	if (locked) {
		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
		BUG_ON(vmas);
		/* check caller initialized locked */
		BUG_ON(*locked != 1);
	}

	if (pages)
		flags |= FOLL_GET;

	pages_done = 0;
	lock_dropped = false;
	for (;;) {
		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
				       vmas, locked);
		if (!locked)
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			return ret;

		/* VM_FAULT_RETRY cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (!pages)
			/* If it's a prefault don't insist harder */
			return ret;

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/* VM_FAULT_RETRY didn't trigger */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
		pages += ret;
		start += ret << PAGE_SHIFT;

		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * without FAULT_FLAG_ALLOW_RETRY but with
		 * FAULT_FLAG_TRIED.
		 */
		*locked = 1;
		lock_dropped = true;
		down_read(&mm->mmap_sem);
		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, NULL);
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		pages++;
		start += PAGE_SIZE;
	}
	if (notify_drop && lock_dropped && *locked) {
		/*
		 * We must let the caller know we temporarily dropped the lock
		 * and so the critical section protected by it was lost.
		 */
		up_read(&mm->mmap_sem);
		*locked = 0;
	}
	return pages_done;
}

/*
 * We can leverage the VM_FAULT_RETRY functionality in the page fault
 * paths better by using either get_user_pages_locked() or
 * get_user_pages_unlocked().
 *
 * get_user_pages_locked() is suitable to replace the form:
 *
 *	down_read(&mm->mmap_sem);
 *	do_something()
 *	get_user_pages(tsk, mm, ..., pages, NULL);
 *	up_read(&mm->mmap_sem);
 *
 * to:
 *
 *	int locked = 1;
 *	down_read(&mm->mmap_sem);
 *	do_something()
 *	get_user_pages_locked(tsk, mm, ..., pages, &locked);
 *	if (locked)
 *		up_read(&mm->mmap_sem);
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	return __get_user_pages_locked(current, current->mm, start, nr_pages,
				       pages, NULL, locked, true,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);

/*
 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
 * tsk, mm to be specified.
 *
 * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
 * caller if required (just like with __get_user_pages). "FOLL_GET"
 * is set implicitly if "pages" is non-NULL.
 */
static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
		struct mm_struct *mm, unsigned long start,
		unsigned long nr_pages, struct page **pages,
		unsigned int gup_flags)
{
	long ret;
	int locked = 1;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL,
				      &locked, false, gup_flags);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *	down_read(&mm->mmap_sem);
 *	get_user_pages(tsk, mm, ..., pages, NULL);
 *	up_read(&mm->mmap_sem);
 *
 * with:
 *
 *	get_user_pages_unlocked(tsk, mm, ..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
					 pages, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * get_user_pages_remote() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       locked, true,
				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's and don't allow
 * passing of a locked parameter.  We also obviously don't pass
 * FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	return __get_user_pages_locked(current, current->mm, start, nr_pages,
				       pages, vmas, NULL, false,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
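/*
 * Quick orientation for the entry points above (illustrative summary; see
 * the individual comments for the authoritative rules):
 *
 *	get_user_pages()	  current mm, caller holds mmap_sem, never
 *				  drops it (no VM_FAULT_RETRY handling)
 *	get_user_pages_locked()	  current mm, caller holds mmap_sem, may
 *				  drop it (reported via *locked)
 *	get_user_pages_unlocked() current mm, takes and releases mmap_sem
 *				  itself
 *	get_user_pages_remote()	  arbitrary tsk/mm, implies FOLL_REMOTE,
 *				  optional retry handling via @locked
 *	get_user_pages_fast()	  current mm, lockless fast path with a
 *				  slow-path fallback (see further down)
 */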
/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking:
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * return 0 on success, negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held.
 *
 * If @nonblocking is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @nonblocking is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@nonblocking will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *nonblocking)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end & ~PAGE_MASK);
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
	if (vma->vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
		gup_flags |= FOLL_FORCE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
				NULL, NULL, nonblocking);
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
		} else if (nstart >= vma->vm_end)
			vma = vma->vm_next;
		if (!vma || vma->vm_start >= end)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;	/* 0 or negative error code */
}
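/*
 * For reference: the mlock() path calls __mm_populate() with
 * ignore_errors == 0, while the mmap() MAP_POPULATE / MAP_LOCKED path goes
 * through the mm_populate() wrapper in <linux/mm.h>, which passes
 * ignore_errors == 1 so a partial population failure does not fail the
 * mmap() itself.
 */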
/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;

	if (__get_user_pages(current, current->mm, addr, 1,
			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			     NULL) < 1)
		return NULL;
	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */

/*
 * Generic RCU Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *     pages containing page tables.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP

#ifndef gup_get_pte
/*
 * We assume that the PTE can be read atomically. If this is not the case for
 * your architecture, please provide the helper.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
	return READ_ONCE(*ptep);
}
#endif

static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct page *page = pages[--(*nr)];

		ClearPageReferenced(page);
		put_page(page);
	}
}
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
			 int write, struct page **pages, int *nr)
{
	struct dev_pagemap *pgmap = NULL;
	int nr_start = *nr, ret = 0;
	pte_t *ptep, *ptem;

	ptem = ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *head, *page;

		/*
		 * Similar to the PMD case below, NUMA hinting must take slow
		 * path using the pte_protnone check.
		 */
		if (pte_protnone(pte))
			goto pte_unmap;

		if (!pte_access_permitted(pte, write))
			goto pte_unmap;

		if (pte_devmap(pte)) {
			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
			if (unlikely(!pgmap)) {
				undo_dev_pagemap(nr, nr_start, pages);
				goto pte_unmap;
			}
		} else if (pte_special(pte))
			goto pte_unmap;

		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		head = compound_head(page);

		if (!page_cache_get_speculative(head))
			goto pte_unmap;

		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
			put_page(head);
			goto pte_unmap;
		}

		VM_BUG_ON_PAGE(compound_head(page) != head, page);

		put_dev_pagemap(pgmap);
		SetPageReferenced(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);

	ret = 1;

pte_unmap:
	pte_unmap(ptem);
	return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * __get_user_pages_fast implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
			 int write, struct page **pages, int *nr)
{
	return 0;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */

#ifdef __HAVE_ARCH_PTE_DEVMAP
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	int nr_start = *nr;
	struct dev_pagemap *pgmap = NULL;

	do {
		struct page *page = pfn_to_page(pfn);

		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, pages);
			return 0;
		}
		SetPageReferenced(page);
		pages[*nr] = page;
		get_page(page);
		put_dev_pagemap(pgmap);
		(*nr)++;
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
	return 1;
}

static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;

	fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
}

static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;

	fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
}
#else
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}

static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pmd_access_permitted(orig, write))
		return 0;

	if (pmd_devmap(orig))
		return __gup_device_huge_pmd(orig, addr, end, pages, nr);

	refs = 0;
	head = pmd_page(orig);
	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	SetPageReferenced(head);
	return 1;
}

static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pud_access_permitted(orig, write))
		return 0;

	if (pud_devmap(orig))
		return __gup_device_huge_pud(orig, addr, end, pages, nr);

	refs = 0;
	head = pud_page(orig);
	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	SetPageReferenced(head);
	return 1;
}

static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
			unsigned long end, int write,
			struct page **pages, int *nr)
{
	int refs;
	struct page *head, *page;

	if (!pgd_access_permitted(orig, write))
		return 0;

	BUILD_BUG_ON(pgd_devmap(orig));
	refs = 0;
	head = pgd_page(orig);
	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	SetPageReferenced(head);
	return 1;
}
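/*
 * Note on the pattern shared by gup_huge_pmd/pud/pgd() above: the subpages
 * are counted first, the references are then taken speculatively on the
 * head page, and finally the page table entry is re-read; if it changed
 * underneath us (e.g. a concurrent split or unmap), the references are
 * backed out and the fast path gives up, so the caller can fall back to
 * the slow GUP path.
 */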
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			return 0;

		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
			/*
			 * NUMA hinting faults need to be handled in the GUP
			 * slowpath for accounting purposes and so that they
			 * can be serialised against THP migration.
			 */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
					  pages, nr))
				return 0;

		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
			/*
			 * architectures may use a different format for a
			 * hugetlbfs pmd than for a THP pmd
			 */
			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
					 PMD_SHIFT, next, write, pages, nr))
				return 0;
		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
			 int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_huge(pud))) {
			if (!gup_huge_pud(pud, pudp, addr, next, write,
					  pages, nr))
				return 0;
		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
					 PUD_SHIFT, next, write, pages, nr))
				return 0;
		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
			 int write, struct page **pages, int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset(&pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (p4d_none(p4d))
			return 0;
		BUILD_BUG_ON(p4d_huge(p4d));
		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
					 P4D_SHIFT, next, write, pages, nr))
				return 0;
		} else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP. It will only return non-negative values.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next, flags;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;

	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					(void __user *)start, len)))
		return 0;

	/*
	 * Disable interrupts.  We use the nested form as we can already have
	 * interrupts disabled by get_futex_key.
	 *
	 * With interrupts disabled, we block page table pages from being
	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
	 * for more details.
	 *
	 * We do not adopt an rcu_read_lock(.) here as we also want to
	 * block IPIs that come from THPs splitting.
	 */

	local_irq_save(flags);
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (unlikely(pgd_huge(pgd))) {
			if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
					  pages, &nr))
				break;
		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
					 PGDIR_SHIFT, next, write, pages, &nr))
				break;
		} else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;
}
#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use __get_user_pages_fast() for the range, or
 * we need to fall back to the slow version:
 */
bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
{
	unsigned long len, end;

	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	return end >= start;
}
#endif

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	int nr = 0, ret = 0;

	start &= PAGE_MASK;

	if (gup_fast_permitted(start, nr_pages, write)) {
		nr = __get_user_pages_fast(start, nr_pages, write, pages);
		ret = nr;
	}

	if (nr < nr_pages) {
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
				write ? FOLL_WRITE : 0);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}
	}

	return ret;
}

#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */