1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/err.h>
5 #include <linux/spinlock.h>
6
7 #include <linux/mm.h>
8 #include <linux/memfd.h>
9 #include <linux/memremap.h>
10 #include <linux/pagemap.h>
11 #include <linux/rmap.h>
12 #include <linux/swap.h>
13 #include <linux/swapops.h>
14 #include <linux/secretmem.h>
15
16 #include <linux/sched/signal.h>
17 #include <linux/rwsem.h>
18 #include <linux/hugetlb.h>
19 #include <linux/migrate.h>
20 #include <linux/mm_inline.h>
21 #include <linux/pagevec.h>
22 #include <linux/sched/mm.h>
23 #include <linux/shmem_fs.h>
24
25 #include <asm/mmu_context.h>
26 #include <asm/tlbflush.h>
27
28 #include "internal.h"
29 #include "swap.h"
30
31 static inline void sanity_check_pinned_pages(struct page **pages,
32 unsigned long npages)
33 {
34 if (!IS_ENABLED(CONFIG_DEBUG_VM))
35 return;
36
37 /*
38 * We only pin anonymous pages if they are exclusive. Once pinned, we
39 * can no longer turn them possibly shared and PageAnonExclusive() will
40 * stick around until the page is freed.
41 *
42 * We'd like to verify that our pinned anonymous pages are still mapped
43 * exclusively. The issue with anon THP is that we don't know how
44 * they are/were mapped when pinning them. However, for anon
45 * THP we can assume that either the given page (PTE-mapped THP) or
46 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
47 * neither is the case, there is certainly something wrong.
48 */
49 for (; npages; npages--, pages++) {
50 struct page *page = *pages;
51 struct folio *folio;
52
53 if (!page)
54 continue;
55
56 folio = page_folio(page);
57
58 if (is_zero_page(page) ||
59 !folio_test_anon(folio))
60 continue;
61 if (!folio_test_large(folio) || folio_test_hugetlb(folio))
62 VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
63 else
64 /* Either a PTE-mapped or a PMD-mapped THP. */
65 VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
66 !PageAnonExclusive(page), page);
67 }
68 }
69
70 /*
71 * Return the folio with ref appropriately incremented,
72 * or NULL if that failed.
73 */
74 static inline struct folio *try_get_folio(struct page *page, int refs)
75 {
76 struct folio *folio;
77
78 retry:
79 folio = page_folio(page);
80 if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
81 return NULL;
82 if (unlikely(!folio_ref_try_add(folio, refs)))
83 return NULL;
84
85 /*
86 * At this point we have a stable reference to the folio; but it
87 * could be that between calling page_folio() and the refcount
88 * increment, the folio was split, in which case we'd end up
89 * holding a reference on a folio that has nothing to do with the page
90 * we were given anymore.
91 * So now that the folio is stable, recheck that the page still
92 * belongs to this folio.
93 */
94 if (unlikely(page_folio(page) != folio)) {
95 folio_put_refs(folio, refs);
96 goto retry;
97 }
98
99 return folio;
100 }
101
102 static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
103 {
104 if (flags & FOLL_PIN) {
105 if (is_zero_folio(folio))
106 return;
107 node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
108 if (folio_has_pincount(folio))
109 atomic_sub(refs, &folio->_pincount);
110 else
111 refs *= GUP_PIN_COUNTING_BIAS;
112 }
113
114 folio_put_refs(folio, refs);
115 }
116
117 /**
118 * try_grab_folio() - increment a folio's refcount by a flag-dependent amount
119 * @folio: pointer to folio to be grabbed
120 * @refs: the value to (effectively) add to the folio's refcount
121 * @flags: gup flags: these are the FOLL_* flag values
122 *
123 * This might not do anything at all, depending on the flags argument.
124 *
125 * "grab" names in this file mean, "look at flags to decide whether to use
126 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
127 *
128 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
129 * time.
130 *
131 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
132 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
133 *
134 * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
135 * be grabbed.
136 *
137 * It is called when we already have a stable reference to the folio,
138 * typically in the GUP slow path.
139 */
140 int __must_check try_grab_folio(struct folio *folio, int refs,
141 unsigned int flags)
142 {
143 if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
144 return -ENOMEM;
145
146 if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
147 return -EREMOTEIO;
148
149 if (flags & FOLL_GET)
150 folio_ref_add(folio, refs);
151 else if (flags & FOLL_PIN) {
152 /*
153 * Don't take a pin on the zero page - it's not going anywhere
154 * and it is used in a *lot* of places.
155 */
156 if (is_zero_folio(folio))
157 return 0;
158
159 /*
160 * Increment the normal page refcount field at least once,
161 * so that the page really is pinned.
162 */
163 if (folio_has_pincount(folio)) {
164 folio_ref_add(folio, refs);
165 atomic_add(refs, &folio->_pincount);
166 } else {
167 folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
168 }
169
170 node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
171 }
172
173 return 0;
174 }
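/*
 * Illustrative sketch of the accounting above (informational only, not part
 * of the API contract). For a pinned folio without a separate _pincount
 * field, one FOLL_PIN reference shows up as a refcount bump of
 * GUP_PIN_COUNTING_BIAS (1024), which is what heuristics such as
 * folio_maybe_dma_pinned() key off:
 *
 *	try_grab_folio(folio, 1, FOLL_GET);	refcount += 1
 *	try_grab_folio(folio, 1, FOLL_PIN);	no _pincount: refcount += 1024
 *						has _pincount: refcount += 1,
 *							       _pincount += 1
 */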
175
176 /**
177 * unpin_user_page() - release a dma-pinned page
178 * @page: pointer to page to be released
179 *
180 * Pages that were pinned via pin_user_pages*() must be released via either
181 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
182 * that such pages can be separately tracked and uniquely handled. In
183 * particular, interactions with RDMA and filesystems need special handling.
184 */
185 void unpin_user_page(struct page *page)
186 {
187 sanity_check_pinned_pages(&page, 1);
188 gup_put_folio(page_folio(page), 1, FOLL_PIN);
189 }
190 EXPORT_SYMBOL(unpin_user_page);
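/*
 * Minimal usage sketch (assumed caller code, not taken from this file):
 * every successful pin must eventually be dropped through the unpin side of
 * the API, never through put_page(). Exact pin_user_pages() signatures may
 * differ between kernel versions.
 *
 *	ret = pin_user_pages(addr, 1, FOLL_WRITE, &page);
 *	if (ret == 1) {
 *		... DMA to / direct access of the page contents ...
 *		unpin_user_page(page);
 *	}
 */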
191
192 /**
193 * unpin_folio() - release a dma-pinned folio
194 * @folio: pointer to folio to be released
195 *
196 * Folios that were pinned via memfd_pin_folios() or other similar routines
197 * must be released either using unpin_folio() or unpin_folios().
198 */
199 void unpin_folio(struct folio *folio)
200 {
201 gup_put_folio(folio, 1, FOLL_PIN);
202 }
203 EXPORT_SYMBOL_GPL(unpin_folio);
204
205 /**
206 * folio_add_pin - Try to get an additional pin on a pinned folio
207 * @folio: The folio to be pinned
208 *
209 * Get an additional pin on a folio we already have a pin on. Makes no change
210 * if the folio is a zero_page.
211 */
212 void folio_add_pin(struct folio *folio)
213 {
214 if (is_zero_folio(folio))
215 return;
216
217 /*
218 * Similar to try_grab_folio(): be sure to *also* increment the normal
219 * page refcount field at least once, so that the page really is
220 * pinned.
221 */
222 if (folio_has_pincount(folio)) {
223 WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
224 folio_ref_inc(folio);
225 atomic_inc(&folio->_pincount);
226 } else {
227 WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
228 folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
229 }
230 }
231
232 static inline struct folio *gup_folio_range_next(struct page *start,
233 unsigned long npages, unsigned long i, unsigned int *ntails)
234 {
235 struct page *next = start + i;
236 struct folio *folio = page_folio(next);
237 unsigned int nr = 1;
238
239 if (folio_test_large(folio))
240 nr = min_t(unsigned int, npages - i,
241 folio_nr_pages(folio) - folio_page_idx(folio, next));
242
243 *ntails = nr;
244 return folio;
245 }
246
247 static inline struct folio *gup_folio_next(struct page **list,
248 unsigned long npages, unsigned long i, unsigned int *ntails)
249 {
250 struct folio *folio = page_folio(list[i]);
251 unsigned int nr;
252
253 for (nr = i + 1; nr < npages; nr++) {
254 if (page_folio(list[nr]) != folio)
255 break;
256 }
257
258 *ntails = nr - i;
259 return folio;
260 }
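/*
 * Example (illustrative only): with pages[] = { A0, A1, B0, A2 }, where A*
 * and B0 belong to folios A and B, gup_folio_next() returns folio A with
 * *ntails == 2 for i == 0, folio B with *ntails == 1 for i == 2, and folio A
 * again with *ntails == 1 for i == 3. Only runs of consecutive array entries
 * belonging to the same folio are batched.
 */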
261
262 /**
263 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
264 * @pages: array of pages to be maybe marked dirty, and definitely released.
265 * @npages: number of pages in the @pages array.
266 * @make_dirty: whether to mark the pages dirty
267 *
268 * "gup-pinned page" refers to a page that has had one of the pin_user_pages()
269 * variants called on that page.
270 *
271 * For each page in the @pages array, make that page (or its head page, if a
272 * compound page) dirty, if @make_dirty is true, and if the page was previously
273 * listed as clean. In any case, releases all pages using unpin_user_page(),
274 * possibly via unpin_user_pages(), for the non-dirty case.
275 *
276 * Please see the unpin_user_page() documentation for details.
277 *
278 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
279 * required, then the caller should a) verify that this is really correct,
280 * because _lock() is usually required, and b) hand code it:
281 * set_page_dirty(), unpin_user_page().
282 *
283 */
284 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
285 bool make_dirty)
286 {
287 unsigned long i;
288 struct folio *folio;
289 unsigned int nr;
290
291 if (!make_dirty) {
292 unpin_user_pages(pages, npages);
293 return;
294 }
295
296 sanity_check_pinned_pages(pages, npages);
297 for (i = 0; i < npages; i += nr) {
298 folio = gup_folio_next(pages, npages, i, &nr);
299 /*
300 * Checking PageDirty at this point may race with
301 * clear_page_dirty_for_io(), but that's OK. Two key
302 * cases:
303 *
304 * 1) This code sees the page as already dirty, so it
305 * skips the call to set_page_dirty(). That could happen
306 * because clear_page_dirty_for_io() called
307 * folio_mkclean(), followed by set_page_dirty().
308 * However, now the page is going to get written back,
309 * which meets the original intention of setting it
310 * dirty, so all is well: clear_page_dirty_for_io() goes
311 * on to call TestClearPageDirty(), and write the page
312 * back.
313 *
314 * 2) This code sees the page as clean, so it calls
315 * set_page_dirty(). The page stays dirty, despite being
316 * written back, so it gets written back again in the
317 * next writeback cycle. This is harmless.
318 */
319 if (!folio_test_dirty(folio)) {
320 folio_lock(folio);
321 folio_mark_dirty(folio);
322 folio_unlock(folio);
323 }
324 gup_put_folio(folio, nr, FOLL_PIN);
325 }
326 }
327 EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
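/*
 * Sketched caller pattern (assumed code, e.g. after a device has DMA'd into
 * user memory that was pinned with pin_user_pages() and FOLL_WRITE):
 *
 *	unpin_user_pages_dirty_lock(pages, nr_pinned, true);
 *
 * which marks the folios dirty, so the received data is not lost to reclaim
 * or writeback, and drops the pins in one pass.
 */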
328
329 /**
330 * unpin_user_page_range_dirty_lock() - release and optionally dirty
331 * gup-pinned page range
332 *
333 * @page: the starting page of a range maybe marked dirty, and definitely released.
334 * @npages: number of consecutive pages to release.
335 * @make_dirty: whether to mark the pages dirty
336 *
337 * "gup-pinned page range" refers to a range of pages that has had one of the
338 * pin_user_pages() variants called on that page.
339 *
340 * The page range must be truly physically contiguous: the page range
341 * corresponds to a contiguous PFN range and all pages can be iterated
342 * naturally.
343 *
344 * For the page ranges defined by [page .. page+npages], make that range (or
345 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
346 * page range was previously listed as clean.
347 *
348 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
349 * required, then the caller should a) verify that this is really correct,
350 * because _lock() is usually required, and b) hand code it:
351 * set_page_dirty(), unpin_user_page().
352 *
353 */
354 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
355 bool make_dirty)
356 {
357 unsigned long i;
358 struct folio *folio;
359 unsigned int nr;
360
361 VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));
362
363 for (i = 0; i < npages; i += nr) {
364 folio = gup_folio_range_next(page, npages, i, &nr);
365 if (make_dirty && !folio_test_dirty(folio)) {
366 folio_lock(folio);
367 folio_mark_dirty(folio);
368 folio_unlock(folio);
369 }
370 gup_put_folio(folio, nr, FOLL_PIN);
371 }
372 }
373 EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
374
375 static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
376 {
377 unsigned long i;
378 struct folio *folio;
379 unsigned int nr;
380
381 /*
382 * Don't perform any sanity checks because we might have raced with
383 * fork() and some anonymous pages might now actually be shared --
384 * which is why we're unpinning after all.
385 */
386 for (i = 0; i < npages; i += nr) {
387 folio = gup_folio_next(pages, npages, i, &nr);
388 gup_put_folio(folio, nr, FOLL_PIN);
389 }
390 }
391
392 /**
393 * unpin_user_pages() - release an array of gup-pinned pages.
394 * @pages: array of pages to be marked dirty and released.
395 * @npages: number of pages in the @pages array.
396 *
397 * For each page in the @pages array, release the page using unpin_user_page().
398 *
399 * Please see the unpin_user_page() documentation for details.
400 */
401 void unpin_user_pages(struct page **pages, unsigned long npages)
402 {
403 unsigned long i;
404 struct folio *folio;
405 unsigned int nr;
406
407 /*
408 * If this WARN_ON() fires, then the system *might* be leaking pages (by
409 * leaving them pinned), but probably not. More likely, gup/pup returned
410 * a hard -ERRNO error to the caller, who erroneously passed it here.
411 */
412 if (WARN_ON(IS_ERR_VALUE(npages)))
413 return;
414
415 sanity_check_pinned_pages(pages, npages);
416 for (i = 0; i < npages; i += nr) {
417 if (!pages[i]) {
418 nr = 1;
419 continue;
420 }
421 folio = gup_folio_next(pages, npages, i, &nr);
422 gup_put_folio(folio, nr, FOLL_PIN);
423 }
424 }
425 EXPORT_SYMBOL(unpin_user_pages);
426
427 /**
428 * unpin_user_folio() - release pages of a folio
429 * @folio: pointer to folio to be released
430 * @npages: number of pages of the same folio
431 *
432 * Release @npages pages of the folio.
433 */
434 void unpin_user_folio(struct folio *folio, unsigned long npages)
435 {
436 gup_put_folio(folio, npages, FOLL_PIN);
437 }
438 EXPORT_SYMBOL(unpin_user_folio);
439
440 /**
441 * unpin_folios() - release an array of gup-pinned folios.
442 * @folios: array of folios to be marked dirty and released.
443 * @nfolios: number of folios in the @folios array.
444 *
445 * For each folio in the @folios array, release the folio using gup_put_folio.
446 *
447 * Please see the unpin_folio() documentation for details.
448 */
449 void unpin_folios(struct folio **folios, unsigned long nfolios)
450 {
451 unsigned long i = 0, j;
452
453 /*
454 * If this WARN_ON() fires, then the system *might* be leaking folios
455 * (by leaving them pinned), but probably not. More likely, gup/pup
456 * returned a hard -ERRNO error to the caller, who erroneously passed
457 * it here.
458 */
459 if (WARN_ON(IS_ERR_VALUE(nfolios)))
460 return;
461
462 while (i < nfolios) {
463 for (j = i + 1; j < nfolios; j++)
464 if (folios[i] != folios[j])
465 break;
466
467 if (folios[i])
468 gup_put_folio(folios[i], j - i, FOLL_PIN);
469 i = j;
470 }
471 }
472 EXPORT_SYMBOL_GPL(unpin_folios);
473
474 /*
475 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
476 * lifecycle. Avoid setting the bit unless necessary, or it might cause write
477 * cache bouncing on large SMP machines for concurrent pinned gups.
478 */
479 static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
480 {
481 if (!mm_flags_test(MMF_HAS_PINNED, mm))
482 mm_flags_set(MMF_HAS_PINNED, mm);
483 }
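/*
 * Informational summary (behaviour implemented elsewhere, not in this file):
 * once MMF_HAS_PINNED is set, fork() becomes more careful with anonymous
 * pages that may be DMA-pinned and copies them for the child instead of
 * sharing them copy-on-write; see folio_needs_cow_for_dma(), which consults
 * this flag together with folio_maybe_dma_pinned().
 */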
484
485 #ifdef CONFIG_MMU
486
487 #ifdef CONFIG_HAVE_GUP_FAST
488 /**
489 * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
490 * @page: pointer to page to be grabbed
491 * @refs: the value to (effectively) add to the folio's refcount
492 * @flags: gup flags: these are the FOLL_* flag values.
493 *
494 * "grab" names in this file mean, "look at flags to decide whether to use
495 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
496 *
497 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
498 * same time. (That's true throughout the get_user_pages*() and
499 * pin_user_pages*() APIs.) Cases:
500 *
501 * FOLL_GET: folio's refcount will be incremented by @refs.
502 *
503 * FOLL_PIN on large folios: folio's refcount will be incremented by
504 * @refs, and its pincount will be incremented by @refs.
505 *
506 * FOLL_PIN on single-page folios: folio's refcount will be incremented by
507 * @refs * GUP_PIN_COUNTING_BIAS.
508 *
509 * Return: The folio containing @page (with refcount appropriately
510 * incremented) for success, or NULL upon failure. If neither FOLL_GET
511 * nor FOLL_PIN was set, that's considered failure, and furthermore,
512 * a likely bug in the caller, so a warning is also emitted.
513 *
514 * It elevates the folio refcount using a "ref unless zero" operation and
515 * must only be called from the GUP fast path.
516 */
517 static struct folio *try_grab_folio_fast(struct page *page, int refs,
518 unsigned int flags)
519 {
520 struct folio *folio;
521
522 /* Warn if this is not called from the GUP fast path */
523 VM_WARN_ON_ONCE(!irqs_disabled());
524
525 if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
526 return NULL;
527
528 if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
529 return NULL;
530
531 if (flags & FOLL_GET)
532 return try_get_folio(page, refs);
533
534 /* FOLL_PIN is set */
535
536 /*
537 * Don't take a pin on the zero page - it's not going anywhere
538 * and it is used in a *lot* of places.
539 */
540 if (is_zero_page(page))
541 return page_folio(page);
542
543 folio = try_get_folio(page, refs);
544 if (!folio)
545 return NULL;
546
547 /*
548 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
549 * right zone, so fail and let the caller fall back to the slow
550 * path.
551 */
552 if (unlikely((flags & FOLL_LONGTERM) &&
553 !folio_is_longterm_pinnable(folio))) {
554 folio_put_refs(folio, refs);
555 return NULL;
556 }
557
558 /*
559 * When pinning a large folio, use an exact count to track it.
560 *
561 * However, be sure to *also* increment the normal folio
562 * refcount field at least once, so that the folio really
563 * is pinned. That's why the refcount from the earlier
564 * try_get_folio() is left intact.
565 */
566 if (folio_has_pincount(folio))
567 atomic_add(refs, &folio->_pincount);
568 else
569 folio_ref_add(folio,
570 refs * (GUP_PIN_COUNTING_BIAS - 1));
571 /*
572 * Adjust the pincount before re-checking the PTE for changes.
573 * This is essentially a smp_mb() and is paired with a memory
574 * barrier in folio_try_share_anon_rmap_*().
575 */
576 smp_mb__after_atomic();
577
578 node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
579
580 return folio;
581 }
582 #endif /* CONFIG_HAVE_GUP_FAST */
583
584 /* Common code for can_follow_write_* */
585 static inline bool can_follow_write_common(struct page *page,
586 struct vm_area_struct *vma, unsigned int flags)
587 {
588 /* Maybe FOLL_FORCE is set to override it? */
589 if (!(flags & FOLL_FORCE))
590 return false;
591
592 /* But FOLL_FORCE has no effect on shared mappings */
593 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
594 return false;
595
596 /* ... or read-only private ones */
597 if (!(vma->vm_flags & VM_MAYWRITE))
598 return false;
599
600 /* ... or already writable ones that just need to take a write fault */
601 if (vma->vm_flags & VM_WRITE)
602 return false;
603
604 /*
605 * See can_change_pte_writable(): we broke COW and could map the page
606 * writable if we have an exclusive anonymous page ...
607 */
608 return page && PageAnon(page) && PageAnonExclusive(page);
609 }
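/*
 * Context (informational): the classic user of this FOLL_FORCE path is
 * debugger-style access, such as ptrace writing a breakpoint into a
 * read-only private mapping; that path reaches GUP with
 * FOLL_FORCE | FOLL_WRITE and the write lands in the process's private COW
 * copy rather than in the underlying file.
 */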
610
611 static struct page *no_page_table(struct vm_area_struct *vma,
612 unsigned int flags, unsigned long address)
613 {
614 if (!(flags & FOLL_DUMP))
615 return NULL;
616
617 /*
618 * When core dumping, we don't want to allocate unnecessary pages or
619 * page tables. Return error instead of NULL to skip handle_mm_fault,
620 * then get_dump_page() will return NULL to leave a hole in the dump.
621 * But we can only make this optimization where a hole would surely
622 * be zero-filled if handle_mm_fault() actually did handle it.
623 */
624 if (is_vm_hugetlb_page(vma)) {
625 struct hstate *h = hstate_vma(vma);
626
627 if (!hugetlbfs_pagecache_present(h, vma, address))
628 return ERR_PTR(-EFAULT);
629 } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) {
630 return ERR_PTR(-EFAULT);
631 }
632
633 return NULL;
634 }
635
636 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
637 /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
638 static inline bool can_follow_write_pud(pud_t pud, struct page *page,
639 struct vm_area_struct *vma,
640 unsigned int flags)
641 {
642 /* If the pud is writable, we can write to the page. */
643 if (pud_write(pud))
644 return true;
645
646 return can_follow_write_common(page, vma, flags);
647 }
648
649 static struct page *follow_huge_pud(struct vm_area_struct *vma,
650 unsigned long addr, pud_t *pudp,
651 int flags, unsigned long *page_mask)
652 {
653 struct mm_struct *mm = vma->vm_mm;
654 struct page *page;
655 pud_t pud = *pudp;
656 unsigned long pfn = pud_pfn(pud);
657 int ret;
658
659 assert_spin_locked(pud_lockptr(mm, pudp));
660
661 if (!pud_present(pud))
662 return NULL;
663
664 if ((flags & FOLL_WRITE) &&
665 !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
666 return NULL;
667
668 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
669 page = pfn_to_page(pfn);
670
671 if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
672 return ERR_PTR(-EMLINK);
673
674 ret = try_grab_folio(page_folio(page), 1, flags);
675 if (ret)
676 page = ERR_PTR(ret);
677 else
678 *page_mask = HPAGE_PUD_NR - 1;
679
680 return page;
681 }
682
683 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
684 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
685 struct vm_area_struct *vma,
686 unsigned int flags)
687 {
688 /* If the pmd is writable, we can write to the page. */
689 if (pmd_write(pmd))
690 return true;
691
692 if (!can_follow_write_common(page, vma, flags))
693 return false;
694
695 /* ... and a write-fault isn't required for other reasons. */
696 if (pmd_needs_soft_dirty_wp(vma, pmd))
697 return false;
698 return !userfaultfd_huge_pmd_wp(vma, pmd);
699 }
700
701 static struct page *follow_huge_pmd(struct vm_area_struct *vma,
702 unsigned long addr, pmd_t *pmd,
703 unsigned int flags,
704 unsigned long *page_mask)
705 {
706 struct mm_struct *mm = vma->vm_mm;
707 pmd_t pmdval = *pmd;
708 struct page *page;
709 int ret;
710
711 assert_spin_locked(pmd_lockptr(mm, pmd));
712
713 page = pmd_page(pmdval);
714 if ((flags & FOLL_WRITE) &&
715 !can_follow_write_pmd(pmdval, page, vma, flags))
716 return NULL;
717
718 /* Avoid dumping huge zero page */
719 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
720 return ERR_PTR(-EFAULT);
721
722 if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
723 return NULL;
724
725 if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
726 return ERR_PTR(-EMLINK);
727
728 VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
729 !PageAnonExclusive(page), page);
730
731 ret = try_grab_folio(page_folio(page), 1, flags);
732 if (ret)
733 return ERR_PTR(ret);
734
735 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
736 if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
737 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
738 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
739
740 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
741 *page_mask = HPAGE_PMD_NR - 1;
742
743 return page;
744 }
745
746 #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
747 static struct page *follow_huge_pud(struct vm_area_struct *vma,
748 unsigned long addr, pud_t *pudp,
749 int flags, unsigned long *page_mask)
750 {
751 return NULL;
752 }
753
754 static struct page *follow_huge_pmd(struct vm_area_struct *vma,
755 unsigned long addr, pmd_t *pmd,
756 unsigned int flags,
757 unsigned long *page_mask)
758 {
759 return NULL;
760 }
761 #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
762
763 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
764 pte_t *pte, unsigned int flags)
765 {
766 if (flags & FOLL_TOUCH) {
767 pte_t orig_entry = ptep_get(pte);
768 pte_t entry = orig_entry;
769
770 if (flags & FOLL_WRITE)
771 entry = pte_mkdirty(entry);
772 entry = pte_mkyoung(entry);
773
774 if (!pte_same(orig_entry, entry)) {
775 set_pte_at(vma->vm_mm, address, pte, entry);
776 update_mmu_cache(vma, address, pte);
777 }
778 }
779
780 /* Proper page table entry exists, but no corresponding struct page */
781 return -EEXIST;
782 }
783
784 /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
785 static inline bool can_follow_write_pte(pte_t pte, struct page *page,
786 struct vm_area_struct *vma,
787 unsigned int flags)
788 {
789 /* If the pte is writable, we can write to the page. */
790 if (pte_write(pte))
791 return true;
792
793 if (!can_follow_write_common(page, vma, flags))
794 return false;
795
796 /* ... and a write-fault isn't required for other reasons. */
797 if (pte_needs_soft_dirty_wp(vma, pte))
798 return false;
799 return !userfaultfd_pte_wp(vma, pte);
800 }
801
802 static struct page *follow_page_pte(struct vm_area_struct *vma,
803 unsigned long address, pmd_t *pmd, unsigned int flags)
804 {
805 struct mm_struct *mm = vma->vm_mm;
806 struct folio *folio;
807 struct page *page;
808 spinlock_t *ptl;
809 pte_t *ptep, pte;
810 int ret;
811
812 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
813 if (!ptep)
814 return no_page_table(vma, flags, address);
815 pte = ptep_get(ptep);
816 if (!pte_present(pte))
817 goto no_page;
818 if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
819 goto no_page;
820
821 page = vm_normal_page(vma, address, pte);
822
823 /*
824 * We only care about anon pages in can_follow_write_pte().
825 */
826 if ((flags & FOLL_WRITE) &&
827 !can_follow_write_pte(pte, page, vma, flags)) {
828 page = NULL;
829 goto out;
830 }
831
832 if (unlikely(!page)) {
833 if (flags & FOLL_DUMP) {
834 /* Avoid special (like zero) pages in core dumps */
835 page = ERR_PTR(-EFAULT);
836 goto out;
837 }
838
839 if (is_zero_pfn(pte_pfn(pte))) {
840 page = pte_page(pte);
841 } else {
842 ret = follow_pfn_pte(vma, address, ptep, flags);
843 page = ERR_PTR(ret);
844 goto out;
845 }
846 }
847 folio = page_folio(page);
848
849 if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
850 page = ERR_PTR(-EMLINK);
851 goto out;
852 }
853
854 VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
855 !PageAnonExclusive(page), page);
856
857 /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
858 ret = try_grab_folio(folio, 1, flags);
859 if (unlikely(ret)) {
860 page = ERR_PTR(ret);
861 goto out;
862 }
863
864 /*
865 * We need to make the page accessible if and only if we are going
866 * to access its content (the FOLL_PIN case). Please see
867 * Documentation/core-api/pin_user_pages.rst for details.
868 */
869 if (flags & FOLL_PIN) {
870 ret = arch_make_folio_accessible(folio);
871 if (ret) {
872 unpin_user_page(page);
873 page = ERR_PTR(ret);
874 goto out;
875 }
876 }
877 if (flags & FOLL_TOUCH) {
878 if ((flags & FOLL_WRITE) &&
879 !pte_dirty(pte) && !folio_test_dirty(folio))
880 folio_mark_dirty(folio);
881 /*
882 * pte_mkyoung() would be more correct here, but atomic care
883 * is needed to avoid losing the dirty bit: it is easier to use
884 * folio_mark_accessed().
885 */
886 folio_mark_accessed(folio);
887 }
888 out:
889 pte_unmap_unlock(ptep, ptl);
890 return page;
891 no_page:
892 pte_unmap_unlock(ptep, ptl);
893 if (!pte_none(pte))
894 return NULL;
895 return no_page_table(vma, flags, address);
896 }
897
898 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
899 unsigned long address, pud_t *pudp,
900 unsigned int flags,
901 unsigned long *page_mask)
902 {
903 pmd_t *pmd, pmdval;
904 spinlock_t *ptl;
905 struct page *page;
906 struct mm_struct *mm = vma->vm_mm;
907
908 pmd = pmd_offset(pudp, address);
909 pmdval = pmdp_get_lockless(pmd);
910 if (pmd_none(pmdval))
911 return no_page_table(vma, flags, address);
912 if (!pmd_present(pmdval))
913 return no_page_table(vma, flags, address);
914 if (likely(!pmd_leaf(pmdval)))
915 return follow_page_pte(vma, address, pmd, flags);
916
917 if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
918 return no_page_table(vma, flags, address);
919
920 ptl = pmd_lock(mm, pmd);
921 pmdval = *pmd;
922 if (unlikely(!pmd_present(pmdval))) {
923 spin_unlock(ptl);
924 return no_page_table(vma, flags, address);
925 }
926 if (unlikely(!pmd_leaf(pmdval))) {
927 spin_unlock(ptl);
928 return follow_page_pte(vma, address, pmd, flags);
929 }
930 if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
931 spin_unlock(ptl);
932 split_huge_pmd(vma, pmd, address);
933 /* If pmd was left empty, stuff a page table in there quickly */
934 return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
935 follow_page_pte(vma, address, pmd, flags);
936 }
937 page = follow_huge_pmd(vma, address, pmd, flags, page_mask);
938 spin_unlock(ptl);
939 return page;
940 }
941
942 static struct page *follow_pud_mask(struct vm_area_struct *vma,
943 unsigned long address, p4d_t *p4dp,
944 unsigned int flags,
945 unsigned long *page_mask)
946 {
947 pud_t *pudp, pud;
948 spinlock_t *ptl;
949 struct page *page;
950 struct mm_struct *mm = vma->vm_mm;
951
952 pudp = pud_offset(p4dp, address);
953 pud = READ_ONCE(*pudp);
954 if (!pud_present(pud))
955 return no_page_table(vma, flags, address);
956 if (pud_leaf(pud)) {
957 ptl = pud_lock(mm, pudp);
958 page = follow_huge_pud(vma, address, pudp, flags, page_mask);
959 spin_unlock(ptl);
960 if (page)
961 return page;
962 return no_page_table(vma, flags, address);
963 }
964 if (unlikely(pud_bad(pud)))
965 return no_page_table(vma, flags, address);
966
967 return follow_pmd_mask(vma, address, pudp, flags, page_mask);
968 }
969
970 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
971 unsigned long address, pgd_t *pgdp,
972 unsigned int flags,
973 unsigned long *page_mask)
974 {
975 p4d_t *p4dp, p4d;
976
977 p4dp = p4d_offset(pgdp, address);
978 p4d = READ_ONCE(*p4dp);
979 BUILD_BUG_ON(p4d_leaf(p4d));
980
981 if (!p4d_present(p4d) || p4d_bad(p4d))
982 return no_page_table(vma, flags, address);
983
984 return follow_pud_mask(vma, address, p4dp, flags, page_mask);
985 }
986
987 /**
988 * follow_page_mask - look up a page descriptor from a user-virtual address
989 * @vma: vm_area_struct mapping @address
990 * @address: virtual address to look up
991 * @flags: flags modifying lookup behaviour
992 * @page_mask: a pointer to output page_mask
993 *
994 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
995 *
996 * When getting an anonymous page and the caller has to trigger unsharing
997 * of a shared anonymous page first, -EMLINK is returned. The caller should
998 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
999 * relevant with FOLL_PIN and !FOLL_WRITE.
1000 *
1001 * On output, @page_mask is set according to the size of the page.
1002 *
1003 * Return: the mapped (struct page *), %NULL if no mapping exists, or
1004 * an error pointer if there is a mapping to something not represented
1005 * by a page descriptor (see also vm_normal_page()).
1006 */
1007 static struct page *follow_page_mask(struct vm_area_struct *vma,
1008 unsigned long address, unsigned int flags,
1009 unsigned long *page_mask)
1010 {
1011 pgd_t *pgd;
1012 struct mm_struct *mm = vma->vm_mm;
1013 struct page *page;
1014
1015 vma_pgtable_walk_begin(vma);
1016
1017 *page_mask = 0;
1018 pgd = pgd_offset(mm, address);
1019
1020 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1021 page = no_page_table(vma, flags, address);
1022 else
1023 page = follow_p4d_mask(vma, address, pgd, flags, page_mask);
1024
1025 vma_pgtable_walk_end(vma);
1026
1027 return page;
1028 }
1029
1030 static int get_gate_page(struct mm_struct *mm, unsigned long address,
1031 unsigned int gup_flags, struct vm_area_struct **vma,
1032 struct page **page)
1033 {
1034 pgd_t *pgd;
1035 p4d_t *p4d;
1036 pud_t *pud;
1037 pmd_t *pmd;
1038 pte_t *pte;
1039 pte_t entry;
1040 int ret = -EFAULT;
1041
1042 /* user gate pages are read-only */
1043 if (gup_flags & FOLL_WRITE)
1044 return -EFAULT;
1045 pgd = pgd_offset(mm, address);
1046 if (pgd_none(*pgd))
1047 return -EFAULT;
1048 p4d = p4d_offset(pgd, address);
1049 if (p4d_none(*p4d))
1050 return -EFAULT;
1051 pud = pud_offset(p4d, address);
1052 if (pud_none(*pud))
1053 return -EFAULT;
1054 pmd = pmd_offset(pud, address);
1055 if (!pmd_present(*pmd))
1056 return -EFAULT;
1057 pte = pte_offset_map(pmd, address);
1058 if (!pte)
1059 return -EFAULT;
1060 entry = ptep_get(pte);
1061 if (pte_none(entry))
1062 goto unmap;
1063 *vma = get_gate_vma(mm);
1064 if (!page)
1065 goto out;
1066 *page = vm_normal_page(*vma, address, entry);
1067 if (!*page) {
1068 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
1069 goto unmap;
1070 *page = pte_page(entry);
1071 }
1072 ret = try_grab_folio(page_folio(*page), 1, gup_flags);
1073 if (unlikely(ret))
1074 goto unmap;
1075 out:
1076 ret = 0;
1077 unmap:
1078 pte_unmap(pte);
1079 return ret;
1080 }
1081
1082 /*
1083 * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
1084 * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
1085 * to 0 and -EBUSY returned.
1086 */
1087 static int faultin_page(struct vm_area_struct *vma,
1088 unsigned long address, unsigned int flags, bool unshare,
1089 int *locked)
1090 {
1091 unsigned int fault_flags = 0;
1092 vm_fault_t ret;
1093
1094 if (flags & FOLL_NOFAULT)
1095 return -EFAULT;
1096 if (flags & FOLL_WRITE)
1097 fault_flags |= FAULT_FLAG_WRITE;
1098 if (flags & FOLL_REMOTE)
1099 fault_flags |= FAULT_FLAG_REMOTE;
1100 if (flags & FOLL_UNLOCKABLE) {
1101 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1102 /*
1103 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
1104 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
1105 * That's because some callers may not be prepared to
1106 * handle early exits caused by non-fatal signals.
1107 */
1108 if (flags & FOLL_INTERRUPTIBLE)
1109 fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
1110 }
1111 if (flags & FOLL_NOWAIT)
1112 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
1113 if (flags & FOLL_TRIED) {
1114 /*
1115 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
1116 * can co-exist
1117 */
1118 fault_flags |= FAULT_FLAG_TRIED;
1119 }
1120 if (unshare) {
1121 fault_flags |= FAULT_FLAG_UNSHARE;
1122 /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
1123 VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
1124 }
1125
1126 ret = handle_mm_fault(vma, address, fault_flags, NULL);
1127
1128 if (ret & VM_FAULT_COMPLETED) {
1129 /*
1130 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
1131 * mmap lock in the page fault handler. Sanity check this.
1132 */
1133 WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
1134 *locked = 0;
1135
1136 /*
1137 * We should do the same as VM_FAULT_RETRY, but let's not
1138 * return -EBUSY since that's not reflecting the reality of
1139 * what has happened - we've just fully completed a page
1140 * fault, with the mmap lock released. Use -EAGAIN to show
1141 * that we want to take the mmap lock _again_.
1142 */
1143 return -EAGAIN;
1144 }
1145
1146 if (ret & VM_FAULT_ERROR) {
1147 int err = vm_fault_to_errno(ret, flags);
1148
1149 if (err)
1150 return err;
1151 BUG();
1152 }
1153
1154 if (ret & VM_FAULT_RETRY) {
1155 if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
1156 *locked = 0;
1157 return -EBUSY;
1158 }
1159
1160 return 0;
1161 }
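/*
 * Summary of faultin_page() return values as consumed by __get_user_pages()
 * (descriptive only; see the switch statement in the caller):
 *
 *	0	fault handled, caller re-walks the page tables
 *	-EBUSY	VM_FAULT_RETRY; mmap_lock possibly dropped (*locked == 0)
 *	-EAGAIN	VM_FAULT_COMPLETED; mmap_lock dropped, retake before retrying
 *	-EFAULT/-ENOMEM/-EHWPOISON	hard errors (vm_fault_to_errno(), or
 *					FOLL_NOFAULT short-circuit)
 */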
1162
1163 /*
1164 * Writing to file-backed mappings which require folio dirty tracking using GUP
1165 * is a fundamentally broken operation, as kernel write access to GUP mappings
1166 * do not adhere to the semantics expected by a file system.
1167 *
1168 * Consider the following scenario:-
1169 *
1170 * 1. A folio is written to via GUP which write-faults the memory, notifying
1171 * the file system and dirtying the folio.
1172 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
1173 * the PTE being marked read-only.
1174 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
1175 * direct mapping.
1176 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
1177 * (though it does not have to).
1178 *
1179 * This results in both data being written to a folio without writenotify, and
1180 * the folio being dirtied unexpectedly (if the caller decides to do so).
1181 */
1182 static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
1183 unsigned long gup_flags)
1184 {
1185 /*
1186 * If we aren't pinning then no problematic write can occur. A long term
1187 * pin is the most egregious case so this is the case we disallow.
1188 */
1189 if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
1190 (FOLL_PIN | FOLL_LONGTERM))
1191 return true;
1192
1193 /*
1194 * If the VMA does not require dirty tracking then no problematic write
1195 * can occur either.
1196 */
1197 return !vma_needs_dirty_tracking(vma);
1198 }
1199
1200 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
1201 {
1202 vm_flags_t vm_flags = vma->vm_flags;
1203 int write = (gup_flags & FOLL_WRITE);
1204 int foreign = (gup_flags & FOLL_REMOTE);
1205 bool vma_anon = vma_is_anonymous(vma);
1206
1207 if (vm_flags & (VM_IO | VM_PFNMAP))
1208 return -EFAULT;
1209
1210 if ((gup_flags & FOLL_ANON) && !vma_anon)
1211 return -EFAULT;
1212
1213 if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
1214 return -EOPNOTSUPP;
1215
1216 if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
1217 return -EOPNOTSUPP;
1218
1219 if (vma_is_secretmem(vma))
1220 return -EFAULT;
1221
1222 if (write) {
1223 if (!vma_anon &&
1224 !writable_file_mapping_allowed(vma, gup_flags))
1225 return -EFAULT;
1226
1227 if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
1228 if (!(gup_flags & FOLL_FORCE))
1229 return -EFAULT;
1230 /*
1231 * We used to let the write,force case do COW in a
1232 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
1233 * set a breakpoint in a read-only mapping of an
1234 * executable, without corrupting the file (yet only
1235 * when that file had been opened for writing!).
1236 * Anon pages in shared mappings are surprising: now
1237 * just reject it.
1238 */
1239 if (!is_cow_mapping(vm_flags))
1240 return -EFAULT;
1241 }
1242 } else if (!(vm_flags & VM_READ)) {
1243 if (!(gup_flags & FOLL_FORCE))
1244 return -EFAULT;
1245 /*
1246 * Is there actually any vma we can reach here which does not
1247 * have VM_MAYREAD set?
1248 */
1249 if (!(vm_flags & VM_MAYREAD))
1250 return -EFAULT;
1251 }
1252 /*
1253 * gups are always data accesses, not instruction
1254 * fetches, so execute=false here
1255 */
1256 if (!arch_vma_access_permitted(vma, write, false, foreign))
1257 return -EFAULT;
1258 return 0;
1259 }
1260
1261 /*
1262 * This is "vma_lookup()", but with a warning if we would have
1263 * historically expanded the stack in the GUP code.
1264 */
1265 static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
1266 unsigned long addr)
1267 {
1268 #ifdef CONFIG_STACK_GROWSUP
1269 return vma_lookup(mm, addr);
1270 #else
1271 static volatile unsigned long next_warn;
1272 struct vm_area_struct *vma;
1273 unsigned long now, next;
1274
1275 vma = find_vma(mm, addr);
1276 if (!vma || (addr >= vma->vm_start))
1277 return vma;
1278
1279 /* Only warn for half-way relevant accesses */
1280 if (!(vma->vm_flags & VM_GROWSDOWN))
1281 return NULL;
1282 if (vma->vm_start - addr > 65536)
1283 return NULL;
1284
1285 /* Let's not warn more than once an hour.. */
1286 now = jiffies; next = next_warn;
1287 if (next && time_before(now, next))
1288 return NULL;
1289 next_warn = now + 60*60*HZ;
1290
1291 /* Let people know things may have changed. */
1292 pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
1293 current->comm, task_pid_nr(current),
1294 vma->vm_start, vma->vm_end, addr);
1295 dump_stack();
1296 return NULL;
1297 #endif
1298 }
1299
1300 /**
1301 * __get_user_pages() - pin user pages in memory
1302 * @mm: mm_struct of target mm
1303 * @start: starting user address
1304 * @nr_pages: number of pages from start to pin
1305 * @gup_flags: flags modifying pin behaviour
1306 * @pages: array that receives pointers to the pages pinned.
1307 * Should be at least nr_pages long. Or NULL, if caller
1308 * only intends to ensure the pages are faulted in.
1309 * @locked: whether we're still with the mmap_lock held
1310 *
1311 * Returns either number of pages pinned (which may be less than the
1312 * number requested), or an error. Details about the return value:
1313 *
1314 * -- If nr_pages is 0, returns 0.
1315 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1316 * -- If nr_pages is >0, and some pages were pinned, returns the number of
1317 * pages pinned. Again, this may be less than nr_pages.
1318 * -- 0 return value is possible when the fault would need to be retried.
1319 *
1320 * The caller is responsible for releasing returned @pages, via put_page().
1321 *
1322 * Must be called with mmap_lock held. It may be released. See below.
1323 *
1324 * __get_user_pages walks a process's page tables and takes a reference to
1325 * each struct page that each user address corresponds to at a given
1326 * instant. That is, it takes the page that would be accessed if a user
1327 * thread accesses the given user virtual address at that instant.
1328 *
1329 * This does not guarantee that the page exists in the user mappings when
1330 * __get_user_pages returns, and there may even be a completely different
1331 * page there in some cases (eg. if mmapped pagecache has been invalidated
1332 * and subsequently re-faulted). However it does guarantee that the page
1333 * won't be freed completely. And mostly callers simply care that the page
1334 * contains data that was valid *at some point in time*. Typically, an IO
1335 * or similar operation cannot guarantee anything stronger anyway because
1336 * locks can't be held over the syscall boundary.
1337 *
1338 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1339 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1340 * appropriate) must be called after the page is finished with, and
1341 * before put_page is called.
1342 *
1343 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
1344 * be released. If this happens *@locked will be set to 0 on return.
1345 *
1346 * A caller using such a combination of @gup_flags must therefore hold the
1347 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
1348 * it must be held for either reading or writing and will not be released.
1349 *
1350 * In most cases, get_user_pages or get_user_pages_fast should be used
1351 * instead of __get_user_pages. __get_user_pages should be used only if
1352 * you need some special @gup_flags.
1353 */
1354 static long __get_user_pages(struct mm_struct *mm,
1355 unsigned long start, unsigned long nr_pages,
1356 unsigned int gup_flags, struct page **pages,
1357 int *locked)
1358 {
1359 long ret = 0, i = 0;
1360 struct vm_area_struct *vma = NULL;
1361 unsigned long page_mask = 0;
1362
1363 if (!nr_pages)
1364 return 0;
1365
1366 start = untagged_addr_remote(mm, start);
1367
1368 VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1369
1370 /* FOLL_GET and FOLL_PIN are mutually exclusive. */
1371 VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
1372 (FOLL_PIN | FOLL_GET));
1373
1374 do {
1375 struct page *page;
1376 unsigned int page_increm;
1377
1378 /* first iteration or cross vma bound */
1379 if (!vma || start >= vma->vm_end) {
1380 /*
1381 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
1382 * lookups+error reporting differently.
1383 */
1384 if (gup_flags & FOLL_MADV_POPULATE) {
1385 vma = vma_lookup(mm, start);
1386 if (!vma) {
1387 ret = -ENOMEM;
1388 goto out;
1389 }
1390 if (check_vma_flags(vma, gup_flags)) {
1391 ret = -EINVAL;
1392 goto out;
1393 }
1394 goto retry;
1395 }
1396 vma = gup_vma_lookup(mm, start);
1397 if (!vma && in_gate_area(mm, start)) {
1398 ret = get_gate_page(mm, start & PAGE_MASK,
1399 gup_flags, &vma,
1400 pages ? &page : NULL);
1401 if (ret)
1402 goto out;
1403 page_mask = 0;
1404 goto next_page;
1405 }
1406
1407 if (!vma) {
1408 ret = -EFAULT;
1409 goto out;
1410 }
1411 ret = check_vma_flags(vma, gup_flags);
1412 if (ret)
1413 goto out;
1414 }
1415 retry:
1416 /*
1417 * If we have a pending SIGKILL, don't keep faulting pages and
1418 * potentially allocating memory.
1419 */
1420 if (fatal_signal_pending(current)) {
1421 ret = -EINTR;
1422 goto out;
1423 }
1424 cond_resched();
1425
1426 page = follow_page_mask(vma, start, gup_flags, &page_mask);
1427 if (!page || PTR_ERR(page) == -EMLINK) {
1428 ret = faultin_page(vma, start, gup_flags,
1429 PTR_ERR(page) == -EMLINK, locked);
1430 switch (ret) {
1431 case 0:
1432 goto retry;
1433 case -EBUSY:
1434 case -EAGAIN:
1435 ret = 0;
1436 fallthrough;
1437 case -EFAULT:
1438 case -ENOMEM:
1439 case -EHWPOISON:
1440 goto out;
1441 }
1442 BUG();
1443 } else if (PTR_ERR(page) == -EEXIST) {
1444 /*
1445 * Proper page table entry exists, but no corresponding
1446 * struct page. If the caller expects **pages to be
1447 * filled in, bail out now, because that can't be done
1448 * for this page.
1449 */
1450 if (pages) {
1451 ret = PTR_ERR(page);
1452 goto out;
1453 }
1454 } else if (IS_ERR(page)) {
1455 ret = PTR_ERR(page);
1456 goto out;
1457 }
1458 next_page:
1459 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1460 if (page_increm > nr_pages)
1461 page_increm = nr_pages;
1462
1463 if (pages) {
1464 struct page *subpage;
1465 unsigned int j;
1466
1467 /*
1468 * This must be a large folio (and doesn't need to
1469 * be the whole folio; it can be part of it), do
1470 * the refcount work for all the subpages too.
1471 *
1472 * NOTE: here the page may not be the head page
1473 * e.g. when start addr is not thp-size aligned.
1474 * try_grab_folio() should have taken care of tail
1475 * pages.
1476 */
1477 if (page_increm > 1) {
1478 struct folio *folio = page_folio(page);
1479
1480 /*
1481 * Since we already hold refcount on the
1482 * large folio, this should never fail.
1483 */
1484 if (try_grab_folio(folio, page_increm - 1,
1485 gup_flags)) {
1486 /*
1487 * Release the 1st page ref if the
1488 * folio is problematic, fail hard.
1489 */
1490 gup_put_folio(folio, 1, gup_flags);
1491 ret = -EFAULT;
1492 goto out;
1493 }
1494 }
1495
1496 for (j = 0; j < page_increm; j++) {
1497 subpage = page + j;
1498 pages[i + j] = subpage;
1499 flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
1500 flush_dcache_page(subpage);
1501 }
1502 }
1503
1504 i += page_increm;
1505 start += page_increm * PAGE_SIZE;
1506 nr_pages -= page_increm;
1507 } while (nr_pages);
1508 out:
1509 return i ? i : ret;
1510 }
1511
1512 static bool vma_permits_fault(struct vm_area_struct *vma,
1513 unsigned int fault_flags)
1514 {
1515 bool write = !!(fault_flags & FAULT_FLAG_WRITE);
1516 bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
1517 vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1518
1519 if (!(vm_flags & vma->vm_flags))
1520 return false;
1521
1522 /*
1523 * The architecture might have a hardware protection
1524 * mechanism other than read/write that can deny access.
1525 *
1526 * gup always represents data access, not instruction
1527 * fetches, so execute=false here:
1528 */
1529 if (!arch_vma_access_permitted(vma, write, false, foreign))
1530 return false;
1531
1532 return true;
1533 }
1534
1535 /**
1536 * fixup_user_fault() - manually resolve a user page fault
1537 * @mm: mm_struct of target mm
1538 * @address: user address
1539 * @fault_flags: flags to pass down to handle_mm_fault()
1540 * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
1541 * does not allow retry. If NULL, the caller must guarantee
1542 * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
1543 *
1544 * This is meant to be called in the specific scenario where for locking reasons
1545 * we try to access user memory in atomic context (within a pagefault_disable()
1546 * section), this returns -EFAULT, and we want to resolve the user fault before
1547 * trying again.
1548 *
1549 * Typically this is meant to be used by the futex code.
1550 *
1551 * The main difference with get_user_pages() is that this function will
1552 * unconditionally call handle_mm_fault() which will in turn perform all the
1553 * necessary SW fixup of the dirty and young bits in the PTE, while
1554 * get_user_pages() only guarantees to update these in the struct page.
1555 *
1556 * This is important for some architectures where those bits also gate the
1557 * access permission to the page because they are maintained in software. On
1558 * such architectures, gup() will not be enough to make a subsequent access
1559 * succeed.
1560 *
1561 * This function will not return with an unlocked mmap_lock. So it does not
1562 * have the same semantics wrt the @mm->mmap_lock as does filemap_fault().
1563 */
1564 int fixup_user_fault(struct mm_struct *mm,
1565 unsigned long address, unsigned int fault_flags,
1566 bool *unlocked)
1567 {
1568 struct vm_area_struct *vma;
1569 vm_fault_t ret;
1570
1571 address = untagged_addr_remote(mm, address);
1572
1573 if (unlocked)
1574 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1575
1576 retry:
1577 vma = gup_vma_lookup(mm, address);
1578 if (!vma)
1579 return -EFAULT;
1580
1581 if (!vma_permits_fault(vma, fault_flags))
1582 return -EFAULT;
1583
1584 if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1585 fatal_signal_pending(current))
1586 return -EINTR;
1587
1588 ret = handle_mm_fault(vma, address, fault_flags, NULL);
1589
1590 if (ret & VM_FAULT_COMPLETED) {
1591 /*
1592 * NOTE: it's a pity that we need to retake the lock here
1593 * to pair with the unlock() in the callers. Ideally we
1594 * could tell the callers so they do not need to unlock.
1595 */
1596 mmap_read_lock(mm);
1597 *unlocked = true;
1598 return 0;
1599 }
1600
1601 if (ret & VM_FAULT_ERROR) {
1602 int err = vm_fault_to_errno(ret, 0);
1603
1604 if (err)
1605 return err;
1606 BUG();
1607 }
1608
1609 if (ret & VM_FAULT_RETRY) {
1610 mmap_read_lock(mm);
1611 *unlocked = true;
1612 fault_flags |= FAULT_FLAG_TRIED;
1613 goto retry;
1614 }
1615
1616 return 0;
1617 }
1618 EXPORT_SYMBOL_GPL(fixup_user_fault);
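/*
 * Sketch of the classic caller pattern (futex-style; assumed code, see the
 * futex implementation for the real thing): a user access failed inside a
 * pagefault_disable() section, so the fault is resolved explicitly before
 * retrying the access. With @unlocked == NULL, FAULT_FLAG_ALLOW_RETRY must
 * not be requested:
 *
 *	mmap_read_lock(mm);
 *	ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE,
 *			       NULL);
 *	mmap_read_unlock(mm);
 */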
1619
1620 /*
1621 * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
1622 * specified, it'll also respond to generic signals. The caller of GUP
1623 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
1624 */
1625 static bool gup_signal_pending(unsigned int flags)
1626 {
1627 if (fatal_signal_pending(current))
1628 return true;
1629
1630 if (!(flags & FOLL_INTERRUPTIBLE))
1631 return false;
1632
1633 return signal_pending(current);
1634 }
1635
1636 /*
1637 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
1638 * the caller. This function may drop the mmap_lock. If it does so, then it will
1639 * set (*locked = 0).
1640 *
1641 * (*locked == 0) means that the caller expects this function to acquire and
1642 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
1643 * the function returns, even though it may have changed temporarily during
1644 * function execution.
1645 *
1646 * Please note that this function, unlike __get_user_pages(), will not return 0
1647 * for nr_pages > 0, unless FOLL_NOWAIT is used.
1648 */
1649 static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1650 unsigned long start,
1651 unsigned long nr_pages,
1652 struct page **pages,
1653 int *locked,
1654 unsigned int flags)
1655 {
1656 long ret, pages_done;
1657 bool must_unlock = false;
1658
1659 if (!nr_pages)
1660 return 0;
1661
1662 /*
1663 * The internal caller expects GUP to manage the lock internally and the
1664 * lock must be released when this returns.
1665 */
1666 if (!*locked) {
1667 if (mmap_read_lock_killable(mm))
1668 return -EAGAIN;
1669 must_unlock = true;
1670 *locked = 1;
1671 }
1672 else
1673 mmap_assert_locked(mm);
1674
1675 if (flags & FOLL_PIN)
1676 mm_set_has_pinned_flag(mm);
1677
1678 /*
1679 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1680 * is to set FOLL_GET if the caller wants pages[] filled in (but has
1681 * carelessly failed to specify FOLL_GET), so keep doing that, but only
1682 * for FOLL_GET, not for the newer FOLL_PIN.
1683 *
1684 * FOLL_PIN always expects pages to be non-null, but no need to assert
1685 * that here, as any failures will be obvious enough.
1686 */
1687 if (pages && !(flags & FOLL_PIN))
1688 flags |= FOLL_GET;
1689
1690 pages_done = 0;
1691 for (;;) {
1692 ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1693 locked);
1694 if (!(flags & FOLL_UNLOCKABLE)) {
1695 /* VM_FAULT_RETRY couldn't trigger, bypass */
1696 pages_done = ret;
1697 break;
1698 }
1699
1700 /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
1701 VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
1702
1703 if (ret > 0) {
1704 nr_pages -= ret;
1705 pages_done += ret;
1706 if (!nr_pages)
1707 break;
1708 }
1709 if (*locked) {
1710 /*
1711 * VM_FAULT_RETRY didn't trigger or it was a
1712 * FOLL_NOWAIT.
1713 */
1714 if (!pages_done)
1715 pages_done = ret;
1716 break;
1717 }
1718 /*
1719 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1720 * For the prefault case (!pages) we only update counts.
1721 */
1722 if (likely(pages))
1723 pages += ret;
1724 start += ret << PAGE_SHIFT;
1725
1726 /* The lock was temporarily dropped, so we must unlock later */
1727 must_unlock = true;
1728
1729 retry:
1730 /*
1731 * Repeat on the address that fired VM_FAULT_RETRY
1732 * with both FAULT_FLAG_ALLOW_RETRY and
1733 * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1734 * by fatal signals or even common signals, depending on
1735 * the caller's request. So we need to check for them before we
1736 * start trying again, otherwise it can loop forever.
1737 */
1738 if (gup_signal_pending(flags)) {
1739 if (!pages_done)
1740 pages_done = -EINTR;
1741 break;
1742 }
1743
1744 ret = mmap_read_lock_killable(mm);
1745 if (ret) {
1746 if (!pages_done)
1747 pages_done = ret;
1748 break;
1749 }
1750
1751 *locked = 1;
1752 ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1753 pages, locked);
1754 if (!*locked) {
1755 /* Continue to retry until we succeed */
1756 VM_WARN_ON_ONCE(ret != 0);
1757 goto retry;
1758 }
1759 if (ret != 1) {
1760 VM_WARN_ON_ONCE(ret > 1);
1761 if (!pages_done)
1762 pages_done = ret;
1763 break;
1764 }
1765 nr_pages--;
1766 pages_done++;
1767 if (!nr_pages)
1768 break;
1769 if (likely(pages))
1770 pages++;
1771 start += PAGE_SIZE;
1772 }
1773 if (must_unlock && *locked) {
1774 /*
1775 * We either temporarily dropped the lock, or the caller
1776 * requested that we both acquire and drop the lock. Either way,
1777 * we must now unlock, and notify the caller of that state.
1778 */
1779 mmap_read_unlock(mm);
1780 *locked = 0;
1781 }
1782
1783 /*
1784 * Failing to pin anything implies something has gone wrong (except when
1785 * FOLL_NOWAIT is specified).
1786 */
1787 if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
1788 return -EFAULT;
1789
1790 return pages_done;
1791 }
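/*
 * Illustrative sketch (editor's addition): the two calling conventions for
 * *locked described above. "mm", "start", "nr", "pages" and "flags" are
 * hypothetical caller-provided values.
 *
 *	long ret;
 *	int locked = 1;
 *
 *	mmap_read_lock(mm);
 *	ret = __get_user_pages_locked(mm, start, nr, pages, &locked, flags);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	... otherwise GUP already dropped the lock and set locked to 0 ...
 *
 * With *locked == 0 on entry (as get_user_pages_unlocked() does, together
 * with FOLL_UNLOCKABLE), the function acquires and releases the mmap_lock
 * itself, and *locked is guaranteed to be 0 again on return.
 */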
1792
1793 /**
1794 * populate_vma_page_range() - populate a range of pages in the vma.
1795 * @vma: target vma
1796 * @start: start address
1797 * @end: end address
1798 * @locked: whether the mmap_lock is still held
1799 *
1800 * This takes care of mlocking the pages too if VM_LOCKED is set.
1801 *
1802 * Return either number of pages pinned in the vma, or a negative error
1803 * code on error.
1804 *
1805 * vma->vm_mm->mmap_lock must be held.
1806 *
1807 * If @locked is NULL, it may be held for read or write and will
1808 * be unperturbed.
1809 *
1810 * If @locked is non-NULL, it must be held for read only and may be
1811 * released. If it's released, *@locked will be set to 0.
1812 */
1813 long populate_vma_page_range(struct vm_area_struct *vma,
1814 unsigned long start, unsigned long end, int *locked)
1815 {
1816 struct mm_struct *mm = vma->vm_mm;
1817 unsigned long nr_pages = (end - start) / PAGE_SIZE;
1818 int local_locked = 1;
1819 int gup_flags;
1820 long ret;
1821
1822 VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1823 VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
1824 VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
1825 VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
1826 mmap_assert_locked(mm);
1827
1828 /*
1829 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
1830 * faultin_page() to break COW, so it has no work to do here.
1831 */
1832 if (vma->vm_flags & VM_LOCKONFAULT)
1833 return nr_pages;
1834
1835 /* ... similarly, we've never faulted in PROT_NONE pages */
1836 if (!vma_is_accessible(vma))
1837 return -EFAULT;
1838
1839 gup_flags = FOLL_TOUCH;
1840 /*
1841 * We want to touch writable mappings with a write fault in order
1842 * to break COW, except for shared mappings because these don't COW
1843 * and we would not want to dirty them for nothing.
1844 *
1845 * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
1846 * readable (ie write-only or executable).
1847 */
1848 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1849 gup_flags |= FOLL_WRITE;
1850 else
1851 gup_flags |= FOLL_FORCE;
1852
1853 if (locked)
1854 gup_flags |= FOLL_UNLOCKABLE;
1855
1856 /*
1857 * We made sure addr is within a VMA, so the following will
1858 * not result in a stack expansion that recurses back here.
1859 */
1860 ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1861 NULL, locked ? locked : &local_locked);
1862 lru_add_drain();
1863 return ret;
1864 }
1865
1866 /*
1867 * faultin_page_range() - populate (prefault) page tables inside the
1868 * given range readable/writable
1869 *
1870 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
1871 *
1872 * @mm: the mm to populate page tables in
1873 * @start: start address
1874 * @end: end address
1875 * @write: whether to prefault readable or writable
1876 * @locked: whether the mmap_lock is still held
1877 *
1878 * Returns either number of processed pages in the MM, or a negative error
1879 * code on error (see __get_user_pages()). Note that this function reports
1880 * errors related to VMAs, such as incompatible mappings, as expected by
1881 * MADV_POPULATE_(READ|WRITE).
1882 *
1883 * The range must be page-aligned.
1884 *
1885 * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
1886 */
1887 long faultin_page_range(struct mm_struct *mm, unsigned long start,
1888 unsigned long end, bool write, int *locked)
1889 {
1890 unsigned long nr_pages = (end - start) / PAGE_SIZE;
1891 int gup_flags;
1892 long ret;
1893
1894 VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1895 VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
1896 mmap_assert_locked(mm);
1897
1898 /*
1899 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
1900 * the page dirty with FOLL_WRITE -- which doesn't make a
1901 * difference with !FOLL_FORCE, because the page is writable
1902 * in the page table.
1903 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
1904 * a poisoned page.
1905 * !FOLL_FORCE: Require proper access permissions.
1906 */
1907 gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
1908 FOLL_MADV_POPULATE;
1909 if (write)
1910 gup_flags |= FOLL_WRITE;
1911
1912 ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
1913 gup_flags);
1914 lru_add_drain();
1915 return ret;
1916 }
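/*
 * Illustrative sketch (editor's addition), loosely modelled on the
 * MADV_POPULATE_(READ|WRITE) caller: fault in [start, end) for write and
 * retake the lock whenever GUP dropped it. All identifiers are hypothetical.
 *
 *	int locked = 1;
 *	long pages;
 *
 *	mmap_read_lock(mm);
 *	while (start < end) {
 *		pages = faultin_page_range(mm, start, end, true, &locked);
 *		if (!locked) {
 *			mmap_read_lock(mm);
 *			locked = 1;
 *		}
 *		if (pages < 0)
 *			break;
 *		start += pages * PAGE_SIZE;
 *	}
 *	if (locked)
 *		mmap_read_unlock(mm);
 */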
1917
1918 /*
1919 * __mm_populate - populate and/or mlock pages within a range of address space.
1920 *
1921 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1922 * flags. VMAs must be already marked with the desired vm_flags, and
1923 * mmap_lock must not be held.
1924 */
1925 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1926 {
1927 struct mm_struct *mm = current->mm;
1928 unsigned long end, nstart, nend;
1929 struct vm_area_struct *vma = NULL;
1930 int locked = 0;
1931 long ret = 0;
1932
1933 end = start + len;
1934
1935 for (nstart = start; nstart < end; nstart = nend) {
1936 /*
1937 * We want to fault in pages for [nstart; end) address range.
1938 * Find first corresponding VMA.
1939 */
1940 if (!locked) {
1941 locked = 1;
1942 mmap_read_lock(mm);
1943 vma = find_vma_intersection(mm, nstart, end);
1944 } else if (nstart >= vma->vm_end)
1945 vma = find_vma_intersection(mm, vma->vm_end, end);
1946
1947 if (!vma)
1948 break;
1949 /*
1950 * Set [nstart; nend) to intersection of desired address
1951 * range with the first VMA. Also, skip undesirable VMA types.
1952 */
1953 nend = min(end, vma->vm_end);
1954 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1955 continue;
1956 if (nstart < vma->vm_start)
1957 nstart = vma->vm_start;
1958 /*
1959 * Now fault in a range of pages. populate_vma_page_range()
1960 * double checks the vma flags, so that it won't mlock pages
1961 * if the vma was already munlocked.
1962 */
1963 ret = populate_vma_page_range(vma, nstart, nend, &locked);
1964 if (ret < 0) {
1965 if (ignore_errors) {
1966 ret = 0;
1967 continue; /* continue at next VMA */
1968 }
1969 break;
1970 }
1971 nend = nstart + ret * PAGE_SIZE;
1972 ret = 0;
1973 }
1974 if (locked)
1975 mmap_read_unlock(mm);
1976 return ret; /* 0 or negative error code */
1977 }
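/*
 * For reference (editor's addition): the userspace operations that funnel
 * into __mm_populate(). Either of the following ends up prefaulting (and,
 * for mlock(), locking) the whole range via the loop above:
 *
 *	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
 *
 *	mlock(buf, len);
 */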
1978 #else /* CONFIG_MMU */
1979 static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1980 unsigned long nr_pages, struct page **pages,
1981 int *locked, unsigned int foll_flags)
1982 {
1983 struct vm_area_struct *vma;
1984 bool must_unlock = false;
1985 vm_flags_t vm_flags;
1986 long i;
1987
1988 if (!nr_pages)
1989 return 0;
1990
1991 /*
1992 * The internal caller expects GUP to manage the lock internally and the
1993 * lock must be released when this returns.
1994 */
1995 if (!*locked) {
1996 if (mmap_read_lock_killable(mm))
1997 return -EAGAIN;
1998 must_unlock = true;
1999 *locked = 1;
2000 }
2001
2002 /* calculate required read or write permissions.
2003 * If FOLL_FORCE is set, we only require the "MAY" flags.
2004 */
2005 vm_flags = (foll_flags & FOLL_WRITE) ?
2006 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
2007 vm_flags &= (foll_flags & FOLL_FORCE) ?
2008 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
2009
2010 for (i = 0; i < nr_pages; i++) {
2011 vma = find_vma(mm, start);
2012 if (!vma)
2013 break;
2014
2015 /* protect what we can, including chardevs */
2016 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
2017 !(vm_flags & vma->vm_flags))
2018 break;
2019
2020 if (pages) {
2021 pages[i] = virt_to_page((void *)start);
2022 if (pages[i])
2023 get_page(pages[i]);
2024 }
2025
2026 start = (start + PAGE_SIZE) & PAGE_MASK;
2027 }
2028
2029 if (must_unlock && *locked) {
2030 mmap_read_unlock(mm);
2031 *locked = 0;
2032 }
2033
2034 return i ? : -EFAULT;
2035 }
2036 #endif /* !CONFIG_MMU */
2037
2038 /**
2039 * fault_in_writeable - fault in userspace address range for writing
2040 * @uaddr: start of address range
2041 * @size: size of address range
2042 *
2043 * Returns the number of bytes not faulted in (like copy_to_user() and
2044 * copy_from_user()).
2045 */
2046 size_t fault_in_writeable(char __user *uaddr, size_t size)
2047 {
2048 const unsigned long start = (unsigned long)uaddr;
2049 const unsigned long end = start + size;
2050 unsigned long cur;
2051
2052 if (unlikely(size == 0))
2053 return 0;
2054 if (!user_write_access_begin(uaddr, size))
2055 return size;
2056
2057 /* Stop once we overflow to 0. */
2058 for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2059 unsafe_put_user(0, (char __user *)cur, out);
2060 out:
2061 user_write_access_end();
2062 if (size > cur - start)
2063 return size - (cur - start);
2064 return 0;
2065 }
2066 EXPORT_SYMBOL(fault_in_writeable);
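/*
 * Illustrative sketch (editor's addition): the usual retry pattern around a
 * user copy. fault_in_writeable() returns the number of bytes it could NOT
 * fault in, so a return value equal to the requested size means no progress
 * is possible. "ubuf", "kbuf" and "count" are hypothetical. Note that on
 * architectures with sub-page permission checks this loop alone may not make
 * forward progress; see fault_in_subpage_writeable() below.
 *
 *	while (copy_to_user(ubuf, kbuf, count)) {
 *		if (fault_in_writeable(ubuf, count) == count)
 *			return -EFAULT;
 *	}
 */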
2067
2068 /**
2069 * fault_in_subpage_writeable - fault in an address range for writing
2070 * @uaddr: start of address range
2071 * @size: size of address range
2072 *
2073 * Fault in a user address range for writing while checking for permissions at
2074 * sub-page granularity (e.g. arm64 MTE). This function should be used when
2075 * the caller cannot guarantee forward progress of a copy_to_user() loop.
2076 *
2077 * Returns the number of bytes not faulted in (like copy_to_user() and
2078 * copy_from_user()).
2079 */
2080 size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
2081 {
2082 size_t faulted_in;
2083
2084 /*
2085 * Attempt faulting in at page granularity first for page table
2086 * permission checking. The arch-specific probe_subpage_writeable()
2087 * functions may not check for this.
2088 */
2089 faulted_in = size - fault_in_writeable(uaddr, size);
2090 if (faulted_in)
2091 faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
2092
2093 return size - faulted_in;
2094 }
2095 EXPORT_SYMBOL(fault_in_subpage_writeable);
2096
2097 /*
2098 * fault_in_safe_writeable - fault in an address range for writing
2099 * @uaddr: start of address range
2100 * @size: length of address range
2101 *
2102 * Faults in an address range for writing. This is primarily useful when we
2103 * already know that some or all of the pages in the address range aren't in
2104 * memory.
2105 *
2106 * Unlike fault_in_writeable(), this function is non-destructive.
2107 *
2108 * Note that we don't pin or otherwise hold the pages that we fault in.
2109 * There's no guarantee that they'll stay in memory for any length of
2110 * time.
2111 *
2112 * Returns the number of bytes not faulted in, like copy_to_user() and
2113 * copy_from_user().
2114 */
2115 size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
2116 {
2117 const unsigned long start = (unsigned long)uaddr;
2118 const unsigned long end = start + size;
2119 unsigned long cur;
2120 struct mm_struct *mm = current->mm;
2121 bool unlocked = false;
2122
2123 if (unlikely(size == 0))
2124 return 0;
2125
2126 mmap_read_lock(mm);
2127 /* Stop once we overflow to 0. */
2128 for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2129 if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
2130 break;
2131 mmap_read_unlock(mm);
2132
2133 if (size > cur - start)
2134 return size - (cur - start);
2135 return 0;
2136 }
2137 EXPORT_SYMBOL(fault_in_safe_writeable);
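/*
 * Illustrative sketch (editor's addition): fault_in_safe_writeable() is the
 * variant to use when the destination must not be modified by the probe
 * itself, e.g. before retrying a copy that was attempted without taking page
 * faults. Assuming a no-fault copy such as copy_to_user_nofault(), with
 * "ubuf", "kbuf" and "count" hypothetical:
 *
 *	while (copy_to_user_nofault(ubuf, kbuf, count)) {
 *		if (fault_in_safe_writeable(ubuf, count) == count)
 *			return -EFAULT;
 *	}
 *
 * Any bytes already present in ubuf are preserved across the fault-in.
 */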
2138
2139 /**
2140 * fault_in_readable - fault in userspace address range for reading
2141 * @uaddr: start of user address range
2142 * @size: size of user address range
2143 *
2144 * Returns the number of bytes not faulted in (like copy_to_user() and
2145 * copy_from_user()).
2146 */
2147 size_t fault_in_readable(const char __user *uaddr, size_t size)
2148 {
2149 const unsigned long start = (unsigned long)uaddr;
2150 const unsigned long end = start + size;
2151 unsigned long cur;
2152 volatile char c;
2153
2154 if (unlikely(size == 0))
2155 return 0;
2156 if (!user_read_access_begin(uaddr, size))
2157 return size;
2158
2159 /* Stop once we overflow to 0. */
2160 for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2161 unsafe_get_user(c, (const char __user *)cur, out);
2162 out:
2163 user_read_access_end();
2164 (void)c;
2165 if (size > cur - start)
2166 return size - (cur - start);
2167 return 0;
2168 }
2169 EXPORT_SYMBOL(fault_in_readable);
2170
2171 /**
2172 * get_dump_page() - pin user page in memory while writing it to core dump
2173 * @addr: user address
2174 * @locked: a pointer to an int denoting whether the mmap sem is held
2175 *
2176 * Returns struct page pointer of user page pinned for dump,
2177 * to be freed afterwards by put_page().
2178 *
2179 * Returns NULL on any kind of failure - a hole must then be inserted into
2180 * the corefile, to preserve alignment with its headers; and also returns
2181 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
2182 * allowing a hole to be left in the corefile to save disk space.
2183 *
2184 * Called without mmap_lock (takes and releases the mmap_lock by itself).
2185 */
2186 #ifdef CONFIG_ELF_CORE
2187 struct page *get_dump_page(unsigned long addr, int *locked)
2188 {
2189 struct page *page;
2190 int ret;
2191
2192 ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
2193 FOLL_FORCE | FOLL_DUMP | FOLL_GET);
2194 return (ret == 1) ? page : NULL;
2195 }
2196 #endif /* CONFIG_ELF_CORE */
2197
2198 #ifdef CONFIG_MIGRATION
2199
2200 /*
2201 * An array of either pages or folios ("pofs"). Although it may seem tempting to
2202 * avoid this complication by simply interpreting a list of folios as a list of
2203 * pages, that approach won't work in the longer term, because eventually the
2204 * layouts of struct page and struct folio will become completely different.
2205 * Furthermore, this pof approach avoids excessive page_folio() calls.
2206 */
2207 struct pages_or_folios {
2208 union {
2209 struct page **pages;
2210 struct folio **folios;
2211 void **entries;
2212 };
2213 bool has_folios;
2214 long nr_entries;
2215 };
2216
2217 static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
2218 {
2219 if (pofs->has_folios)
2220 return pofs->folios[i];
2221 return page_folio(pofs->pages[i]);
2222 }
2223
2224 static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
2225 {
2226 pofs->entries[i] = NULL;
2227 }
2228
2229 static void pofs_unpin(struct pages_or_folios *pofs)
2230 {
2231 if (pofs->has_folios)
2232 unpin_folios(pofs->folios, pofs->nr_entries);
2233 else
2234 unpin_user_pages(pofs->pages, pofs->nr_entries);
2235 }
2236
2237 static struct folio *pofs_next_folio(struct folio *folio,
2238 struct pages_or_folios *pofs, long *index_ptr)
2239 {
2240 long i = *index_ptr + 1;
2241
2242 if (!pofs->has_folios && folio_test_large(folio)) {
2243 const unsigned long start_pfn = folio_pfn(folio);
2244 const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
2245
2246 for (; i < pofs->nr_entries; i++) {
2247 unsigned long pfn = page_to_pfn(pofs->pages[i]);
2248
2249 /* Is this page part of this folio? */
2250 if (pfn < start_pfn || pfn >= end_pfn)
2251 break;
2252 }
2253 }
2254
2255 if (unlikely(i == pofs->nr_entries))
2256 return NULL;
2257 *index_ptr = i;
2258
2259 return pofs_get_folio(pofs, i);
2260 }
2261
2262 /*
2263 * Returns the number of collected folios. Return value is always >= 0.
2264 */
2265 static unsigned long collect_longterm_unpinnable_folios(
2266 struct list_head *movable_folio_list,
2267 struct pages_or_folios *pofs)
2268 {
2269 unsigned long collected = 0;
2270 struct folio *folio;
2271 int drained = 0;
2272 long i = 0;
2273
2274 for (folio = pofs_get_folio(pofs, i); folio;
2275 folio = pofs_next_folio(folio, pofs, &i)) {
2276
2277 if (folio_is_longterm_pinnable(folio))
2278 continue;
2279
2280 collected++;
2281
2282 if (folio_is_device_coherent(folio))
2283 continue;
2284
2285 if (folio_test_hugetlb(folio)) {
2286 folio_isolate_hugetlb(folio, movable_folio_list);
2287 continue;
2288 }
2289
2290 if (drained == 0 && folio_may_be_lru_cached(folio) &&
2291 folio_ref_count(folio) !=
2292 folio_expected_ref_count(folio) + 1) {
2293 lru_add_drain();
2294 drained = 1;
2295 }
2296 if (drained == 1 && folio_may_be_lru_cached(folio) &&
2297 folio_ref_count(folio) !=
2298 folio_expected_ref_count(folio) + 1) {
2299 lru_add_drain_all();
2300 drained = 2;
2301 }
2302
2303 if (!folio_isolate_lru(folio))
2304 continue;
2305
2306 list_add_tail(&folio->lru, movable_folio_list);
2307 node_stat_mod_folio(folio,
2308 NR_ISOLATED_ANON + folio_is_file_lru(folio),
2309 folio_nr_pages(folio));
2310 }
2311
2312 return collected;
2313 }
2314
2315 /*
2316 * Unpins all folios and migrates device coherent folios and the folios on movable_folio_list.
2317 * Returns -EAGAIN if all folios were successfully migrated or -errno for
2318 * failure (or partial success).
2319 */
2320 static int
2321 migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
2322 struct pages_or_folios *pofs)
2323 {
2324 int ret;
2325 unsigned long i;
2326
2327 for (i = 0; i < pofs->nr_entries; i++) {
2328 struct folio *folio = pofs_get_folio(pofs, i);
2329
2330 if (folio_is_device_coherent(folio)) {
2331 /*
2332 * Migration will fail if the folio is pinned, so
2333 * convert the pin on the source folio to a normal
2334 * reference.
2335 */
2336 pofs_clear_entry(pofs, i);
2337 folio_get(folio);
2338 gup_put_folio(folio, 1, FOLL_PIN);
2339
2340 if (migrate_device_coherent_folio(folio)) {
2341 ret = -EBUSY;
2342 goto err;
2343 }
2344
2345 continue;
2346 }
2347
2348 /*
2349 * We can't migrate folios with unexpected references, so drop
2350 * the reference obtained by __get_user_pages_locked().
2351 * Migrating folios have been added to movable_folio_list after
2352 * calling folio_isolate_lru(), which takes a reference, so the
2353 * folio won't be freed if it's migrating.
2354 */
2355 unpin_folio(folio);
2356 pofs_clear_entry(pofs, i);
2357 }
2358
2359 if (!list_empty(movable_folio_list)) {
2360 struct migration_target_control mtc = {
2361 .nid = NUMA_NO_NODE,
2362 .gfp_mask = GFP_USER | __GFP_NOWARN,
2363 .reason = MR_LONGTERM_PIN,
2364 };
2365
2366 if (migrate_pages(movable_folio_list, alloc_migration_target,
2367 NULL, (unsigned long)&mtc, MIGRATE_SYNC,
2368 MR_LONGTERM_PIN, NULL)) {
2369 ret = -ENOMEM;
2370 goto err;
2371 }
2372 }
2373
2374 putback_movable_pages(movable_folio_list);
2375
2376 return -EAGAIN;
2377
2378 err:
2379 pofs_unpin(pofs);
2380 putback_movable_pages(movable_folio_list);
2381
2382 return ret;
2383 }
2384
2385 static long
2386 check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
2387 {
2388 LIST_HEAD(movable_folio_list);
2389 unsigned long collected;
2390
2391 collected = collect_longterm_unpinnable_folios(&movable_folio_list,
2392 pofs);
2393 if (!collected)
2394 return 0;
2395
2396 return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
2397 }
2398
2399 /*
2400 * Check whether all folios are *allowed* to be pinned indefinitely (long term).
2401 * Rather confusingly, all folios in the range are required to be pinned via
2402 * FOLL_PIN, before calling this routine.
2403 *
2404 * Return values:
2405 *
2406 * 0: if everything is OK and all folios in the range are allowed to be pinned,
2407 * then this routine leaves all folios pinned and returns zero for success.
2408 *
2409 * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
2410 * routine will migrate those folios away and unpin all the folios in the range. If
2411 * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
2412 * caller should re-pin the entire range with FOLL_PIN and then call this
2413 * routine again.
2414 *
2415 * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
2416 * indicates a migration failure. The caller should give up, and propagate the
2417 * error back up the call stack. The caller does not need to unpin any folios in
2418 * that case, because this routine will do the unpinning.
2419 */
2420 static long check_and_migrate_movable_folios(unsigned long nr_folios,
2421 struct folio **folios)
2422 {
2423 struct pages_or_folios pofs = {
2424 .folios = folios,
2425 .has_folios = true,
2426 .nr_entries = nr_folios,
2427 };
2428
2429 return check_and_migrate_movable_pages_or_folios(&pofs);
2430 }
2431
2432 /*
2433 * Return values and behavior are the same as those for
2434 * check_and_migrate_movable_folios().
2435 */
2436 static long check_and_migrate_movable_pages(unsigned long nr_pages,
2437 struct page **pages)
2438 {
2439 struct pages_or_folios pofs = {
2440 .pages = pages,
2441 .has_folios = false,
2442 .nr_entries = nr_pages,
2443 };
2444
2445 return check_and_migrate_movable_pages_or_folios(&pofs);
2446 }
2447 #else
2448 static long check_and_migrate_movable_pages(unsigned long nr_pages,
2449 struct page **pages)
2450 {
2451 return 0;
2452 }
2453
2454 static long check_and_migrate_movable_folios(unsigned long nr_folios,
2455 struct folio **folios)
2456 {
2457 return 0;
2458 }
2459 #endif /* CONFIG_MIGRATION */
2460
2461 /*
2462 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
2463 * allows us to process the FOLL_LONGTERM flag.
2464 */
2465 static long __gup_longterm_locked(struct mm_struct *mm,
2466 unsigned long start,
2467 unsigned long nr_pages,
2468 struct page **pages,
2469 int *locked,
2470 unsigned int gup_flags)
2471 {
2472 unsigned int flags;
2473 long rc, nr_pinned_pages;
2474
2475 if (!(gup_flags & FOLL_LONGTERM))
2476 return __get_user_pages_locked(mm, start, nr_pages, pages,
2477 locked, gup_flags);
2478
2479 flags = memalloc_pin_save();
2480 do {
2481 nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
2482 pages, locked,
2483 gup_flags);
2484 if (nr_pinned_pages <= 0) {
2485 rc = nr_pinned_pages;
2486 break;
2487 }
2488
2489 /* FOLL_LONGTERM implies FOLL_PIN */
2490 rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
2491 } while (rc == -EAGAIN);
2492 memalloc_pin_restore(flags);
2493 return rc ? rc : nr_pinned_pages;
2494 }
2495
2496 /*
2497 * Check that the given flags are valid for the exported gup/pup interface, and
2498 * update them with the required flags that the caller must have set.
2499 */
2500 static bool is_valid_gup_args(struct page **pages, int *locked,
2501 unsigned int *gup_flags_p, unsigned int to_set)
2502 {
2503 unsigned int gup_flags = *gup_flags_p;
2504
2505 /*
2506 * These flags are not allowed to be specified externally to the gup
2507 * interfaces:
2508 * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
2509 * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
2510 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
2511 */
2512 if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
2513 return false;
2514
2515 gup_flags |= to_set;
2516 if (locked) {
2517 /* At the external interface locked must be set */
2518 if (WARN_ON_ONCE(*locked != 1))
2519 return false;
2520
2521 gup_flags |= FOLL_UNLOCKABLE;
2522 }
2523
2524 /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2525 if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
2526 (FOLL_PIN | FOLL_GET)))
2527 return false;
2528
2529 /* LONGTERM can only be specified when pinning */
2530 if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
2531 return false;
2532
2533 /* Pages input must be given if using GET/PIN */
2534 if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
2535 return false;
2536
2537 /* We want to allow the pgmap to be hot-unplugged at all times */
2538 if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
2539 (gup_flags & FOLL_PCI_P2PDMA)))
2540 return false;
2541
2542 *gup_flags_p = gup_flags;
2543 return true;
2544 }
2545
2546 #ifdef CONFIG_MMU
2547 /**
2548 * get_user_pages_remote() - pin user pages in memory
2549 * @mm: mm_struct of target mm
2550 * @start: starting user address
2551 * @nr_pages: number of pages from start to pin
2552 * @gup_flags: flags modifying lookup behaviour
2553 * @pages: array that receives pointers to the pages pinned.
2554 * Should be at least nr_pages long. Or NULL, if caller
2555 * only intends to ensure the pages are faulted in.
2556 * @locked: pointer to lock flag indicating whether lock is held and
2557 * subsequently whether VM_FAULT_RETRY functionality can be
2558 * utilised. Lock must initially be held.
2559 *
2560 * Returns either number of pages pinned (which may be less than the
2561 * number requested), or an error. Details about the return value:
2562 *
2563 * -- If nr_pages is 0, returns 0.
2564 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2565 * -- If nr_pages is >0, and some pages were pinned, returns the number of
2566 * pages pinned. Again, this may be less than nr_pages.
2567 *
2568 * The caller is responsible for releasing returned @pages, via put_page().
2569 *
2570 * Must be called with mmap_lock held for read or write.
2571 *
2572 * get_user_pages_remote walks a process's page tables and takes a reference
2573 * to each struct page that each user address corresponds to at a given
2574 * instant. That is, it takes the page that would be accessed if a user
2575 * thread accesses the given user virtual address at that instant.
2576 *
2577 * This does not guarantee that the page exists in the user mappings when
2578 * get_user_pages_remote returns, and there may even be a completely different
2579 * page there in some cases (eg. if mmapped pagecache has been invalidated
2580 * and subsequently re-faulted). However it does guarantee that the page
2581 * won't be freed completely. And mostly callers simply care that the page
2582 * contains data that was valid *at some point in time*. Typically, an IO
2583 * or similar operation cannot guarantee anything stronger anyway because
2584 * locks can't be held over the syscall boundary.
2585 *
2586 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2587 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2588 * be called after the page is finished with, and before put_page is called.
2589 *
2590 * get_user_pages_remote is typically used for fewer-copy IO operations,
2591 * to get a handle on the memory by some means other than accesses
2592 * via the user virtual addresses. The pages may be submitted for
2593 * DMA to devices or accessed via their kernel linear mapping (via the
2594 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2595 *
2596 * See also get_user_pages_fast, for performance critical applications.
2597 *
2598 * get_user_pages_remote should be phased out in favor of
2599 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
2600 * should use get_user_pages_remote because it cannot pass
2601 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2602 */
2603 long get_user_pages_remote(struct mm_struct *mm,
2604 unsigned long start, unsigned long nr_pages,
2605 unsigned int gup_flags, struct page **pages,
2606 int *locked)
2607 {
2608 int local_locked = 1;
2609
2610 if (!is_valid_gup_args(pages, locked, &gup_flags,
2611 FOLL_TOUCH | FOLL_REMOTE))
2612 return -EINVAL;
2613
2614 return __get_user_pages_locked(mm, start, nr_pages, pages,
2615 locked ? locked : &local_locked,
2616 gup_flags);
2617 }
2618 EXPORT_SYMBOL(get_user_pages_remote);
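/*
 * Illustrative sketch (editor's addition): pinning one page of another
 * process's address space, ptrace/access_remote_vm style. "mm" is a
 * hypothetical target mm the caller already holds a reference on, and
 * "addr" a user address within it.
 *
 *	struct page *page;
 *	int locked = 1;
 *	long got;
 *
 *	if (mmap_read_lock_killable(mm))
 *		return -EINTR;
 *	got = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	if (got != 1)
 *		return got < 0 ? got : -EFAULT;
 *	... access the page via kmap_local_page(), then set_page_dirty_lock() ...
 *	put_page(page);
 */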
2619
2620 #else /* CONFIG_MMU */
2621 long get_user_pages_remote(struct mm_struct *mm,
2622 unsigned long start, unsigned long nr_pages,
2623 unsigned int gup_flags, struct page **pages,
2624 int *locked)
2625 {
2626 return 0;
2627 }
2628 #endif /* !CONFIG_MMU */
2629
2630 /**
2631 * get_user_pages() - pin user pages in memory
2632 * @start: starting user address
2633 * @nr_pages: number of pages from start to pin
2634 * @gup_flags: flags modifying lookup behaviour
2635 * @pages: array that receives pointers to the pages pinned.
2636 * Should be at least nr_pages long. Or NULL, if caller
2637 * only intends to ensure the pages are faulted in.
2638 *
2639 * This is the same as get_user_pages_remote(), just with a less-flexible
2640 * calling convention where we assume that the mm being operated on belongs to
2641 * the current task, and doesn't allow passing of a locked parameter. We also
2642 * obviously don't pass FOLL_REMOTE in here.
2643 */
2644 long get_user_pages(unsigned long start, unsigned long nr_pages,
2645 unsigned int gup_flags, struct page **pages)
2646 {
2647 int locked = 1;
2648
2649 if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
2650 return -EINVAL;
2651
2652 return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2653 &locked, gup_flags);
2654 }
2655 EXPORT_SYMBOL(get_user_pages);
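/*
 * Illustrative sketch (editor's addition): the classic get/dirty/put cycle
 * described above. "uaddr" and "NR_PAGES" are hypothetical; new code that
 * needs a real pin should normally use pin_user_pages() instead.
 *
 *	struct page *pages[NR_PAGES];
 *	long i, got;
 *
 *	got = get_user_pages(uaddr, NR_PAGES, FOLL_WRITE, pages);
 *	if (got <= 0)
 *		return got ? got : -EFAULT;
 *	... write to the pages through their kernel mapping ...
 *	for (i = 0; i < got; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */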
2656
2657 /*
2658 * get_user_pages_unlocked() is suitable to replace the form:
2659 *
2660 * mmap_read_lock(mm);
2661 * get_user_pages(mm, ..., pages, NULL);
2662 * mmap_read_unlock(mm);
2663 *
2664 * with:
2665 *
2666 * get_user_pages_unlocked(mm, ..., pages);
2667 *
2668 * It is functionally equivalent to get_user_pages_fast so
2669 * get_user_pages_fast should be used instead if specific gup_flags
2670 * (e.g. FOLL_FORCE) are not required.
2671 */
2672 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2673 struct page **pages, unsigned int gup_flags)
2674 {
2675 int locked = 0;
2676
2677 if (!is_valid_gup_args(pages, NULL, &gup_flags,
2678 FOLL_TOUCH | FOLL_UNLOCKABLE))
2679 return -EINVAL;
2680
2681 return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2682 &locked, gup_flags);
2683 }
2684 EXPORT_SYMBOL(get_user_pages_unlocked);
2685
2686 /*
2687 * GUP-fast
2688 *
2689 * get_user_pages_fast attempts to pin user pages by walking the page
2690 * tables directly and avoids taking locks. Thus the walker needs to be
2691 * protected from page table pages being freed from under it, and should
2692 * block any THP splits.
2693 *
2694 * One way to achieve this is to have the walker disable interrupts, and
2695 * rely on IPIs from the TLB flushing code blocking before the page table
2696 * pages are freed. This is unsuitable for architectures that do not need
2697 * to broadcast an IPI when invalidating TLBs.
2698 *
2699 * Another way to achieve this is to batch up the pages containing page tables
2700 * belonging to more than one mm_user, then rcu_sched a callback to free those
2701 * pages. Disabling interrupts will allow the gup_fast() walker to both block
2702 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
2703 * (which is a relatively rare event). The code below adopts this strategy.
2704 *
2705 * Before activating this code, please be aware that the following assumptions
2706 * are currently made:
2707 *
2708 * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2709 * free pages containing page tables or TLB flushing requires IPI broadcast.
2710 *
2711 * *) ptes can be read atomically by the architecture.
2712 *
2713 * *) valid user addresses are below TASK_SIZE_MAX
2714 *
2715 * The last two assumptions can be relaxed by the addition of helper functions.
2716 *
2717 * This code is based heavily on the PowerPC implementation by Nick Piggin.
2718 */
2719 #ifdef CONFIG_HAVE_GUP_FAST
2720 /*
2721 * Used in the GUP-fast path to determine whether GUP is permitted to work on
2722 * a specific folio.
2723 *
2724 * This call assumes the caller has pinned the folio, that the lowest page table
2725 * level still points to this folio, and that interrupts have been disabled.
2726 *
2727 * GUP-fast must reject all secretmem folios.
2728 *
2729 * Writing to pinned file-backed dirty tracked folios is inherently problematic
2730 * (see comment describing the writable_file_mapping_allowed() function). We
2731 * therefore try to avoid the most egregious case of a long-term mapping doing
2732 * so.
2733 *
2734 * This function cannot be as thorough as that one as the VMA is not available
2735 * in the fast path, so instead we whitelist known good cases and if in doubt,
2736 * fall back to the slow path.
2737 */
2738 static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
2739 {
2740 bool reject_file_backed = false;
2741 struct address_space *mapping;
2742 bool check_secretmem = false;
2743 unsigned long mapping_flags;
2744
2745 /*
2746 * If we aren't pinning then no problematic write can occur. A long term
2747 * pin is the most egregious case so this is the one we disallow.
2748 */
2749 if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
2750 (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
2751 reject_file_backed = true;
2752
2753 /* We hold a folio reference, so we can safely access folio fields. */
2754
2755 /* secretmem folios are always order-0 folios. */
2756 if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
2757 check_secretmem = true;
2758
2759 if (!reject_file_backed && !check_secretmem)
2760 return true;
2761
2762 if (WARN_ON_ONCE(folio_test_slab(folio)))
2763 return false;
2764
2765 /* hugetlb neither requires dirty-tracking nor can be secretmem. */
2766 if (folio_test_hugetlb(folio))
2767 return true;
2768
2769 /*
2770 * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
2771 * cannot proceed, which means no actions performed under RCU can
2772 * proceed either.
2773 *
2774 * inodes and thus their mappings are freed under RCU, which means the
2775 * mapping cannot be freed beneath us and thus we can safely dereference
2776 * it.
2777 */
2778 lockdep_assert_irqs_disabled();
2779
2780 /*
2781 * However, there may be operations which _alter_ the mapping, so ensure
2782 * we read it once and only once.
2783 */
2784 mapping = READ_ONCE(folio->mapping);
2785
2786 /*
2787 * The mapping may have been truncated, in any case we cannot determine
2788 * if this mapping is safe - fall back to slow path to determine how to
2789 * proceed.
2790 */
2791 if (!mapping)
2792 return false;
2793
2794 /* Anonymous folios pose no problem. */
2795 mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
2796 if (mapping_flags)
2797 return mapping_flags & FOLIO_MAPPING_ANON;
2798
2799 /*
2800 * At this point, we know the mapping is non-null and points to an
2801 * address_space object.
2802 */
2803 if (check_secretmem && secretmem_mapping(mapping))
2804 return false;
2805 /* The only remaining allowed file system is shmem. */
2806 return !reject_file_backed || shmem_mapping(mapping);
2807 }
2808
2809 static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
2810 unsigned int flags, struct page **pages)
2811 {
2812 while ((*nr) - nr_start) {
2813 struct folio *folio = page_folio(pages[--(*nr)]);
2814
2815 folio_clear_referenced(folio);
2816 gup_put_folio(folio, 1, flags);
2817 }
2818 }
2819
2820 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
2821 /*
2822 * GUP-fast relies on pte change detection to avoid concurrent pgtable
2823 * operations.
2824 *
2825 * To pin the page, GUP-fast needs to do the following, in order:
2826 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
2827 *
2828 * For the rest of pgtable operations where pgtable updates can be racy
2829 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
2830 * is pinned.
2831 *
2832 * Above will work for all pte-level operations, including THP split.
2833 *
2834 * For THP collapse, it's a bit more complicated because GUP-fast may be
2835 * walking a pgtable page that is being freed (pte is still valid but pmd
2836 * can be cleared already). To avoid race in such condition, we need to
2837 * also check pmd here to make sure pmd doesn't change (corresponds to
2838 * pmdp_collapse_flush() in the THP collapse code path).
2839 */
2840 static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2841 unsigned long end, unsigned int flags, struct page **pages,
2842 int *nr)
2843 {
2844 int ret = 0;
2845 pte_t *ptep, *ptem;
2846
2847 ptem = ptep = pte_offset_map(&pmd, addr);
2848 if (!ptep)
2849 return 0;
2850 do {
2851 pte_t pte = ptep_get_lockless(ptep);
2852 struct page *page;
2853 struct folio *folio;
2854
2855 /*
2856 * Always fall back to ordinary GUP on PROT_NONE-mapped pages:
2857 * pte_access_permitted() should reject these pages
2858 * either way; otherwise, GUP-fast might succeed in
2859 * cases where ordinary GUP would fail due to VMA access
2860 * permissions.
2861 */
2862 if (pte_protnone(pte))
2863 goto pte_unmap;
2864
2865 if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2866 goto pte_unmap;
2867
2868 if (pte_special(pte))
2869 goto pte_unmap;
2870
2871 /* If it's not marked as special it must have a valid memmap. */
2872 VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
2873 page = pte_page(pte);
2874
2875 folio = try_grab_folio_fast(page, 1, flags);
2876 if (!folio)
2877 goto pte_unmap;
2878
2879 if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2880 unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
2881 gup_put_folio(folio, 1, flags);
2882 goto pte_unmap;
2883 }
2884
2885 if (!gup_fast_folio_allowed(folio, flags)) {
2886 gup_put_folio(folio, 1, flags);
2887 goto pte_unmap;
2888 }
2889
2890 if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
2891 gup_put_folio(folio, 1, flags);
2892 goto pte_unmap;
2893 }
2894
2895 /*
2896 * We need to make the page accessible if and only if we are
2897 * going to access its content (the FOLL_PIN case). Please
2898 * see Documentation/core-api/pin_user_pages.rst for
2899 * details.
2900 */
2901 if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
2902 gup_put_folio(folio, 1, flags);
2903 goto pte_unmap;
2904 }
2905 folio_set_referenced(folio);
2906 pages[*nr] = page;
2907 (*nr)++;
2908 } while (ptep++, addr += PAGE_SIZE, addr != end);
2909
2910 ret = 1;
2911
2912 pte_unmap:
2913 pte_unmap(ptem);
2914 return ret;
2915 }
2916 #else
2917
2918 /*
2919 * If we can't determine whether or not a pte is special, then fail immediately
2920 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2921 * to be special.
2922 *
2923 * For a futex to be placed on a THP tail page, get_futex_key requires a
2924 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2925 * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
2926 */
2927 static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2928 unsigned long end, unsigned int flags, struct page **pages,
2929 int *nr)
2930 {
2931 return 0;
2932 }
2933 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2934
2935 static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2936 unsigned long end, unsigned int flags, struct page **pages,
2937 int *nr)
2938 {
2939 struct page *page;
2940 struct folio *folio;
2941 int refs;
2942
2943 if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2944 return 0;
2945
2946 if (pmd_special(orig))
2947 return 0;
2948
2949 refs = (end - addr) >> PAGE_SHIFT;
2950 page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2951
2952 folio = try_grab_folio_fast(page, refs, flags);
2953 if (!folio)
2954 return 0;
2955
2956 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2957 gup_put_folio(folio, refs, flags);
2958 return 0;
2959 }
2960
2961 if (!gup_fast_folio_allowed(folio, flags)) {
2962 gup_put_folio(folio, refs, flags);
2963 return 0;
2964 }
2965 if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
2966 gup_put_folio(folio, refs, flags);
2967 return 0;
2968 }
2969
2970 pages += *nr;
2971 *nr += refs;
2972 for (; refs; refs--)
2973 *(pages++) = page++;
2974 folio_set_referenced(folio);
2975 return 1;
2976 }
2977
2978 static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
2979 unsigned long end, unsigned int flags, struct page **pages,
2980 int *nr)
2981 {
2982 struct page *page;
2983 struct folio *folio;
2984 int refs;
2985
2986 if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2987 return 0;
2988
2989 if (pud_special(orig))
2990 return 0;
2991
2992 refs = (end - addr) >> PAGE_SHIFT;
2993 page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2994
2995 folio = try_grab_folio_fast(page, refs, flags);
2996 if (!folio)
2997 return 0;
2998
2999 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
3000 gup_put_folio(folio, refs, flags);
3001 return 0;
3002 }
3003
3004 if (!gup_fast_folio_allowed(folio, flags)) {
3005 gup_put_folio(folio, refs, flags);
3006 return 0;
3007 }
3008
3009 if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
3010 gup_put_folio(folio, refs, flags);
3011 return 0;
3012 }
3013
3014 pages += *nr;
3015 *nr += refs;
3016 for (; refs; refs--)
3017 *(pages++) = page++;
3018 folio_set_referenced(folio);
3019 return 1;
3020 }
3021
3022 static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
3023 unsigned long end, unsigned int flags, struct page **pages,
3024 int *nr)
3025 {
3026 unsigned long next;
3027 pmd_t *pmdp;
3028
3029 pmdp = pmd_offset_lockless(pudp, pud, addr);
3030 do {
3031 pmd_t pmd = pmdp_get_lockless(pmdp);
3032
3033 next = pmd_addr_end(addr, end);
3034 if (!pmd_present(pmd))
3035 return 0;
3036
3037 if (unlikely(pmd_leaf(pmd))) {
3038 /* See gup_fast_pte_range() */
3039 if (pmd_protnone(pmd))
3040 return 0;
3041
3042 if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
3043 pages, nr))
3044 return 0;
3045
3046 } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
3047 pages, nr))
3048 return 0;
3049 } while (pmdp++, addr = next, addr != end);
3050
3051 return 1;
3052 }
3053
3054 static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
3055 unsigned long end, unsigned int flags, struct page **pages,
3056 int *nr)
3057 {
3058 unsigned long next;
3059 pud_t *pudp;
3060
3061 pudp = pud_offset_lockless(p4dp, p4d, addr);
3062 do {
3063 pud_t pud = READ_ONCE(*pudp);
3064
3065 next = pud_addr_end(addr, end);
3066 if (unlikely(!pud_present(pud)))
3067 return 0;
3068 if (unlikely(pud_leaf(pud))) {
3069 if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
3070 pages, nr))
3071 return 0;
3072 } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
3073 pages, nr))
3074 return 0;
3075 } while (pudp++, addr = next, addr != end);
3076
3077 return 1;
3078 }
3079
3080 static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
3081 unsigned long end, unsigned int flags, struct page **pages,
3082 int *nr)
3083 {
3084 unsigned long next;
3085 p4d_t *p4dp;
3086
3087 p4dp = p4d_offset_lockless(pgdp, pgd, addr);
3088 do {
3089 p4d_t p4d = READ_ONCE(*p4dp);
3090
3091 next = p4d_addr_end(addr, end);
3092 if (!p4d_present(p4d))
3093 return 0;
3094 BUILD_BUG_ON(p4d_leaf(p4d));
3095 if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
3096 pages, nr))
3097 return 0;
3098 } while (p4dp++, addr = next, addr != end);
3099
3100 return 1;
3101 }
3102
3103 static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
3104 unsigned int flags, struct page **pages, int *nr)
3105 {
3106 unsigned long next;
3107 pgd_t *pgdp;
3108
3109 pgdp = pgd_offset(current->mm, addr);
3110 do {
3111 pgd_t pgd = READ_ONCE(*pgdp);
3112
3113 next = pgd_addr_end(addr, end);
3114 if (pgd_none(pgd))
3115 return;
3116 BUILD_BUG_ON(pgd_leaf(pgd));
3117 if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
3118 pages, nr))
3119 return;
3120 } while (pgdp++, addr = next, addr != end);
3121 }
3122 #else
3123 static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
3124 unsigned int flags, struct page **pages, int *nr)
3125 {
3126 }
3127 #endif /* CONFIG_HAVE_GUP_FAST */
3128
3129 #ifndef gup_fast_permitted
3130 /*
3131 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
3132 * we need to fall back to the slow version:
3133 */
3134 static bool gup_fast_permitted(unsigned long start, unsigned long end)
3135 {
3136 return true;
3137 }
3138 #endif
3139
3140 static unsigned long gup_fast(unsigned long start, unsigned long end,
3141 unsigned int gup_flags, struct page **pages)
3142 {
3143 unsigned long flags;
3144 int nr_pinned = 0;
3145 unsigned seq;
3146
3147 if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
3148 !gup_fast_permitted(start, end))
3149 return 0;
3150
3151 if (gup_flags & FOLL_PIN) {
3152 if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
3153 return 0;
3154 }
3155
3156 /*
3157 * Disable interrupts. The nested form is used in order to allow full,
3158 * general purpose use of this routine.
3159 *
3160 * With interrupts disabled, we block page table pages from being freed
3161 * from under us. See struct mmu_table_batch comments in
3162 * include/asm-generic/tlb.h for more details.
3163 *
3164 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
3165 * that come from callers of tlb_remove_table_sync_one().
3166 */
3167 local_irq_save(flags);
3168 gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
3169 local_irq_restore(flags);
3170
3171 /*
3172 * When pinning pages for DMA there could be a concurrent write protect
3173 * from fork() via copy_page_range(), in this case always fail GUP-fast.
3174 */
3175 if (gup_flags & FOLL_PIN) {
3176 if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
3177 gup_fast_unpin_user_pages(pages, nr_pinned);
3178 return 0;
3179 } else {
3180 sanity_check_pinned_pages(pages, nr_pinned);
3181 }
3182 }
3183 return nr_pinned;
3184 }
3185
3186 static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
3187 unsigned int gup_flags, struct page **pages)
3188 {
3189 unsigned long len, end;
3190 unsigned long nr_pinned;
3191 int locked = 0;
3192 int ret;
3193
3194 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
3195 FOLL_FORCE | FOLL_PIN | FOLL_GET |
3196 FOLL_FAST_ONLY | FOLL_NOFAULT |
3197 FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
3198 return -EINVAL;
3199
3200 if (gup_flags & FOLL_PIN)
3201 mm_set_has_pinned_flag(current->mm);
3202
3203 if (!(gup_flags & FOLL_FAST_ONLY))
3204 might_lock_read(&current->mm->mmap_lock);
3205
3206 start = untagged_addr(start) & PAGE_MASK;
3207 len = nr_pages << PAGE_SHIFT;
3208 if (check_add_overflow(start, len, &end))
3209 return -EOVERFLOW;
3210 if (end > TASK_SIZE_MAX)
3211 return -EFAULT;
3212
3213 nr_pinned = gup_fast(start, end, gup_flags, pages);
3214 if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
3215 return nr_pinned;
3216
3217 /* Slow path: try to get the remaining pages with get_user_pages */
3218 start += nr_pinned << PAGE_SHIFT;
3219 pages += nr_pinned;
3220 ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
3221 pages, &locked,
3222 gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
3223 if (ret < 0) {
3224 /*
3225 * The caller has to unpin the pages we already pinned so
3226 * returning -errno is not an option
3227 */
3228 if (nr_pinned)
3229 return nr_pinned;
3230 return ret;
3231 }
3232 return ret + nr_pinned;
3233 }
3234
3235 /**
3236 * get_user_pages_fast_only() - pin user pages in memory
3237 * @start: starting user address
3238 * @nr_pages: number of pages from start to pin
3239 * @gup_flags: flags modifying pin behaviour
3240 * @pages: array that receives pointers to the pages pinned.
3241 * Should be at least nr_pages long.
3242 *
3243 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
3244 * the regular GUP.
3245 *
3246 * If the architecture does not support this function, simply return with no
3247 * pages pinned.
3248 *
3249 * Careful, careful! COW breaking can go either way, so a non-write
3250 * access can get ambiguous page results. If you call this function without
3251 * 'write' set, you'd better be sure that you're ok with that ambiguity.
3252 */
3253 int get_user_pages_fast_only(unsigned long start, int nr_pages,
3254 unsigned int gup_flags, struct page **pages)
3255 {
3256 /*
3257 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
3258 * because gup fast is always a "pin with a +1 page refcount" request.
3259 *
3260 * FOLL_FAST_ONLY is required in order to match the API description of
3261 * this routine: no fall back to regular ("slow") GUP.
3262 */
3263 if (!is_valid_gup_args(pages, NULL, &gup_flags,
3264 FOLL_GET | FOLL_FAST_ONLY))
3265 return -EINVAL;
3266
3267 return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3268 }
3269 EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
3270
3271 /**
3272 * get_user_pages_fast() - pin user pages in memory
3273 * @start: starting user address
3274 * @nr_pages: number of pages from start to pin
3275 * @gup_flags: flags modifying pin behaviour
3276 * @pages: array that receives pointers to the pages pinned.
3277 * Should be at least nr_pages long.
3278 *
3279 * Attempt to pin user pages in memory without taking mm->mmap_lock.
3280 * If not successful, it will fall back to taking the lock and
3281 * calling get_user_pages().
3282 *
3283 * Returns number of pages pinned. This may be fewer than the number requested.
3284 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
3285 * -errno.
3286 */
3287 int get_user_pages_fast(unsigned long start, int nr_pages,
3288 unsigned int gup_flags, struct page **pages)
3289 {
3290 /*
3291 * The caller may or may not have explicitly set FOLL_GET; either way is
3292 * OK. However, internally (within mm/gup.c), gup fast variants must set
3293 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
3294 * request.
3295 */
3296 if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
3297 return -EINVAL;
3298 return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3299 }
3300 EXPORT_SYMBOL_GPL(get_user_pages_fast);
3301
3302 /**
3303 * pin_user_pages_fast() - pin user pages in memory without taking locks
3304 *
3305 * @start: starting user address
3306 * @nr_pages: number of pages from start to pin
3307 * @gup_flags: flags modifying pin behaviour
3308 * @pages: array that receives pointers to the pages pinned.
3309 * Should be at least nr_pages long.
3310 *
3311 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
3312 * get_user_pages_fast() for documentation on the function arguments, because
3313 * the arguments here are identical.
3314 *
3315 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3316 * see Documentation/core-api/pin_user_pages.rst for further details.
3317 *
3318 * Note that if a zero_page is amongst the returned pages, it will not have
3319 * pins in it and unpin_user_page() will not remove pins from it.
3320 */
3321 int pin_user_pages_fast(unsigned long start, int nr_pages,
3322 unsigned int gup_flags, struct page **pages)
3323 {
3324 if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
3325 return -EINVAL;
3326 return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3327 }
3328 EXPORT_SYMBOL_GPL(pin_user_pages_fast);
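/*
 * Illustrative sketch (editor's addition): a typical FOLL_PIN life cycle for
 * a long-lived DMA buffer. "uaddr" and "NR_PAGES" are hypothetical.
 *
 *	struct page *pages[NR_PAGES];
 *	int pinned;
 *
 *	pinned = pin_user_pages_fast(uaddr, NR_PAGES,
 *				     FOLL_WRITE | FOLL_LONGTERM, pages);
 *	if (pinned <= 0)
 *		return pinned ? pinned : -EFAULT;
 *	... map the pages for DMA and let the device write to them ...
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 */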
3329
/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying lookup behaviour
 * @pages: array that receives pointers to the pages pinned.
 *	Should be at least nr_pages long.
 * @locked: pointer to lock flag indicating whether lock is held and
 *	subsequently whether VM_FAULT_RETRY functionality can be
 *	utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	int local_locked = 1;

	if (!is_valid_gup_args(pages, locked, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
		return 0;
	return __gup_longterm_locked(mm, start, nr_pages, pages,
				     locked ? locked : &local_locked,
				     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);

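/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): pinning a page of another process while following the locking
 * contract described in the kernel-doc above. "mm" is a hypothetical target
 * mm the caller already holds a reference to, and "uaddr" is an address in
 * that mm.
 *
 *	struct page *page;
 *	int locked = 1;
 *	long ret;
 *
 *	mmap_read_lock(mm);
 *	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, &page, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);	// the lock may have been dropped internally
 *	if (ret == 1)
 *		unpin_user_page(page);
 */
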
/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying lookup behaviour
 * @pages: array that receives pointers to the pages pinned.
 *	Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages)
{
	int locked = 1;

	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
		return 0;
	return __gup_longterm_locked(current->mm, start, nr_pages,
				     pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);

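/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): a driver pinning a user buffer of the current process for device
 * access. "uaddr" and "nr" are hypothetical; on success, exactly "ret"
 * entries of "pages" are valid and must eventually be unpinned.
 *
 *	struct page **pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
 *	long ret;
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	ret = pin_user_pages(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM, pages);
 *	if (ret > 0) {
 *		... program the device with the pinned pages ...
 *		unpin_user_pages(pages, ret);	// once the device is done
 *	}
 *	kvfree(pages);
 */
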
/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	int locked = 0;

	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
		return 0;

	return __gup_longterm_locked(current->mm, start, nr_pages, pages,
				     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);

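/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): the unlocked variant takes and drops mmap_lock internally, so the
 * caller must not hold it. Note that the argument order differs from
 * pin_user_pages_fast(): @pages precedes @gup_flags here. "uaddr" is a
 * hypothetical user address.
 *
 *	struct page *page;
 *
 *	if (pin_user_pages_unlocked(uaddr, 1, &page, FOLL_WRITE) == 1)
 *		unpin_user_page(page);
 */
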
/**
 * memfd_pin_folios() - pin folios associated with a memfd
 * @memfd: the memfd whose folios are to be pinned
 * @start: the first memfd offset
 * @end: the last memfd offset (inclusive)
 * @folios: array that receives pointers to the folios pinned
 * @max_folios: maximum number of entries in @folios
 * @offset: the offset into the first folio
 *
 * Attempt to pin folios associated with a memfd in the contiguous range
 * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
 * the folios are either found in the page cache or allocated and added to
 * it if necessary. Once the folios are located, they are all pinned via
 * FOLL_PIN and @offset is populated with the offset into the first folio.
 * Eventually, these pinned folios must be released using either
 * unpin_folios() or unpin_folio().
 *
 * Note that the folios may stay pinned for an indefinite amount of time,
 * and in most cases that duration is controlled by userspace. This behavior
 * is effectively the same as using FOLL_LONGTERM with other GUP APIs.
 *
 * Returns number of folios pinned, which could be less than @max_folios
 * as it depends on the folio sizes that cover the range [start, end].
 * If no folios were pinned, it returns -errno.
 */
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
		      struct folio **folios, unsigned int max_folios,
		      pgoff_t *offset)
{
	unsigned int flags, nr_folios, nr_found;
	unsigned int i, pgshift = PAGE_SHIFT;
	pgoff_t start_idx, end_idx;
	struct folio *folio = NULL;
	struct folio_batch fbatch;
	struct hstate *h;
	long ret = -EINVAL;

	if (start < 0 || start > end || !max_folios)
		return -EINVAL;

	if (!memfd)
		return -EINVAL;

	if (!shmem_file(memfd) && !is_file_hugepages(memfd))
		return -EINVAL;

	if (end >= i_size_read(file_inode(memfd)))
		return -EINVAL;

	if (is_file_hugepages(memfd)) {
		h = hstate_file(memfd);
		pgshift = huge_page_shift(h);
	}

	flags = memalloc_pin_save();
	do {
		nr_folios = 0;
		start_idx = start >> pgshift;
		end_idx = end >> pgshift;
		if (is_file_hugepages(memfd)) {
			start_idx <<= huge_page_order(h);
			end_idx <<= huge_page_order(h);
		}

		folio_batch_init(&fbatch);
		while (start_idx <= end_idx && nr_folios < max_folios) {
			/*
			 * In most cases, we should be able to find the folios
			 * in the page cache. If we cannot find them for some
			 * reason, we try to allocate them and add them to the
			 * page cache.
			 */
			nr_found = filemap_get_folios_contig(memfd->f_mapping,
							     &start_idx,
							     end_idx,
							     &fbatch);
			if (folio) {
				folio_put(folio);
				folio = NULL;
			}

			for (i = 0; i < nr_found; i++) {
				folio = fbatch.folios[i];

				if (try_grab_folio(folio, 1, FOLL_PIN)) {
					folio_batch_release(&fbatch);
					ret = -EINVAL;
					goto err;
				}

				if (nr_folios == 0)
					*offset = offset_in_folio(folio, start);

				folios[nr_folios] = folio;
				if (++nr_folios == max_folios)
					break;
			}

			folio = NULL;
			folio_batch_release(&fbatch);
			if (!nr_found) {
				folio = memfd_alloc_folio(memfd, start_idx);
				if (IS_ERR(folio)) {
					ret = PTR_ERR(folio);
					if (ret != -EEXIST)
						goto err;
					folio = NULL;
				}
			}
		}

		ret = check_and_migrate_movable_folios(nr_folios, folios);
	} while (ret == -EAGAIN);

	memalloc_pin_restore(flags);
	return ret ? ret : nr_folios;
err:
	memalloc_pin_restore(flags);
	unpin_folios(folios, nr_folios);

	return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);

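/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): pinning the folios backing the first 1 MiB of a memfd. "memfd" is
 * a hypothetical struct file obtained e.g. via fget(), assumed to be at least
 * 1 MiB in size; MAX_FOLIOS is an arbitrary array size chosen by the caller.
 *
 *	struct folio *folios[MAX_FOLIOS];
 *	pgoff_t start_offset;
 *	long nr;
 *
 *	nr = memfd_pin_folios(memfd, 0, SZ_1M - 1, folios, MAX_FOLIOS,
 *			      &start_offset);
 *	if (nr > 0) {
 *		... map or DMA to the pinned folios ...
 *		unpin_folios(folios, nr);
 *	}
 */
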
/**
 * folio_add_pins() - add pins to an already-pinned folio
 * @folio: the folio to add more pins to
 * @pins: number of pins to add
 *
 * Try to add more pins to an already-pinned folio. The semantics
 * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
 * be changed.
 *
 * This function is helpful when having obtained a pin on a large folio
 * using memfd_pin_folios(), but wanting to logically unpin parts
 * (e.g., individual pages) of the folio later, for example, using
 * unpin_user_page_range_dirty_lock().
 *
 * This is not the right interface to initially pin a folio.
 */
int folio_add_pins(struct folio *folio, unsigned int pins)
{
	VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));

	return try_grab_folio(folio, pins, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(folio_add_pins);

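/*
 * Illustrative usage sketch (editorial addition, not part of the kernel
 * source): after obtaining one pin on a large folio, e.g. via
 * memfd_pin_folios(), take an extra pin so that parts of the folio can later
 * be released independently. "folio" is assumed to be already pinned, as the
 * kernel-doc above requires.
 *
 *	int ret = folio_add_pins(folio, 1);
 *
 *	if (ret)
 *		return ret;		// could not take the extra pin
 *	...
 *	unpin_folio(folio);		// drop the extra pin when done
 */
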