1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Memory Migration functionality - linux/mm/migrate.c
4 *
5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6 *
7 * Page migration was first developed in the context of the memory hotplug
8 * project. The main authors of the migration code are:
9 *
10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11 * Hirokazu Takahashi <taka@valinux.co.jp>
12 * Dave Hansen <haveblue@us.ibm.com>
13 * Christoph Lameter
14 */
15
16 #include <linux/migrate.h>
17 #include <linux/export.h>
18 #include <linux/swap.h>
19 #include <linux/swapops.h>
20 #include <linux/pagemap.h>
21 #include <linux/buffer_head.h>
22 #include <linux/mm_inline.h>
23 #include <linux/ksm.h>
24 #include <linux/rmap.h>
25 #include <linux/topology.h>
26 #include <linux/cpu.h>
27 #include <linux/cpuset.h>
28 #include <linux/writeback.h>
29 #include <linux/mempolicy.h>
30 #include <linux/vmalloc.h>
31 #include <linux/security.h>
32 #include <linux/backing-dev.h>
33 #include <linux/compaction.h>
34 #include <linux/syscalls.h>
35 #include <linux/compat.h>
36 #include <linux/hugetlb.h>
37 #include <linux/gfp.h>
38 #include <linux/pfn_t.h>
39 #include <linux/page_idle.h>
40 #include <linux/page_owner.h>
41 #include <linux/sched/mm.h>
42 #include <linux/ptrace.h>
43 #include <linux/memory.h>
44 #include <linux/sched/sysctl.h>
45 #include <linux/memory-tiers.h>
46 #include <linux/pagewalk.h>
47
48 #include <asm/tlbflush.h>
49
50 #include <trace/events/migrate.h>
51
52 #include "internal.h"
53 #include "swap.h"
54
55 bool isolate_movable_page(struct page *page, isolate_mode_t mode)
56 {
57 struct folio *folio = folio_get_nontail_page(page);
58 const struct movable_operations *mops;
59
60 /*
61 * Avoid burning cycles with pages that are yet under __free_pages(),
62 * or just got freed under us.
63 *
64 * In case we 'win' a race for a movable page being freed under us and
65 * raise its refcount, preventing __free_pages() from doing its job,
66 * the folio_put() at the end of this block will take care of
67 * releasing this page, thus avoiding a nasty leakage.
68 */
69 if (!folio)
70 goto out;
71
72 /*
73 * Check the movable flag before taking the page lock because
74 * we use non-atomic bitops on newly allocated page flags, so
75 * unconditionally grabbing the lock would corrupt the page owner's use of them.
76 */
77 if (unlikely(!__folio_test_movable(folio)))
78 goto out_putfolio;
79
80 /*
81 * As movable pages are not isolated from LRU lists, concurrent
82 * compaction threads can race against page migration functions
83 * as well as race against a page being released.
84 *
85 * In order to avoid having an already isolated movable page
86 * being (wrongly) re-isolated while it is under migration,
87 * or to avoid attempting to isolate pages being released,
88 * let's be sure we have the page lock
89 * before proceeding with the movable page isolation steps.
90 */
91 if (unlikely(!folio_trylock(folio)))
92 goto out_putfolio;
93
94 if (!folio_test_movable(folio) || folio_test_isolated(folio))
95 goto out_no_isolated;
96
97 mops = folio_movable_ops(folio);
98 VM_BUG_ON_FOLIO(!mops, folio);
99
100 if (!mops->isolate_page(&folio->page, mode))
101 goto out_no_isolated;
102
103 /* Driver shouldn't use the isolated flag */
104 WARN_ON_ONCE(folio_test_isolated(folio));
105 folio_set_isolated(folio);
106 folio_unlock(folio);
107
108 return true;
109
110 out_no_isolated:
111 folio_unlock(folio);
112 out_putfolio:
113 folio_put(folio);
114 out:
115 return false;
116 }
117
118 static void putback_movable_folio(struct folio *folio)
119 {
120 const struct movable_operations *mops = folio_movable_ops(folio);
121
122 mops->putback_page(&folio->page);
123 folio_clear_isolated(folio);
124 }
125
126 /*
127 * Put previously isolated pages back onto the appropriate lists
128 * from where they were once taken off for compaction/migration.
129 *
130 * This function shall be used whenever the isolated pageset has been
131 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
132 * and folio_isolate_hugetlb().
133 */
134 void putback_movable_pages(struct list_head *l)
135 {
136 struct folio *folio;
137 struct folio *folio2;
138
139 list_for_each_entry_safe(folio, folio2, l, lru) {
140 if (unlikely(folio_test_hugetlb(folio))) {
141 folio_putback_hugetlb(folio);
142 continue;
143 }
144 list_del(&folio->lru);
145 /*
146 * We isolated a non-LRU movable folio, so here we can use
147 * __folio_test_movable because an LRU folio's mapping cannot
148 * have PAGE_MAPPING_MOVABLE.
149 */
150 if (unlikely(__folio_test_movable(folio))) {
151 VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
152 folio_lock(folio);
153 if (folio_test_movable(folio))
154 putback_movable_folio(folio);
155 else
156 folio_clear_isolated(folio);
157 folio_unlock(folio);
158 folio_put(folio);
159 } else {
160 node_stat_mod_folio(folio, NR_ISOLATED_ANON +
161 folio_is_file_lru(folio), -folio_nr_pages(folio));
162 folio_putback_lru(folio);
163 }
164 }
165 }
166
167 /* Must be called with an elevated refcount on the non-hugetlb folio */
168 bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
169 {
170 bool isolated, lru;
171
172 if (folio_test_hugetlb(folio))
173 return folio_isolate_hugetlb(folio, list);
174
175 lru = !__folio_test_movable(folio);
176 if (lru)
177 isolated = folio_isolate_lru(folio);
178 else
179 isolated = isolate_movable_page(&folio->page,
180 ISOLATE_UNEVICTABLE);
181
182 if (!isolated)
183 return false;
184
185 list_add(&folio->lru, list);
186 if (lru)
187 node_stat_add_folio(folio, NR_ISOLATED_ANON +
188 folio_is_file_lru(folio));
189
190 return true;
191 }
192
193 static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
194 struct folio *folio,
195 unsigned long idx)
196 {
197 struct page *page = folio_page(folio, idx);
198 bool contains_data;
199 pte_t newpte;
200 void *addr;
201
202 if (PageCompound(page))
203 return false;
204 VM_BUG_ON_PAGE(!PageAnon(page), page);
205 VM_BUG_ON_PAGE(!PageLocked(page), page);
206 VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
207
208 if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
209 mm_forbids_zeropage(pvmw->vma->vm_mm))
210 return false;
211
212 /*
213 * The pmd entry mapping the old thp was flushed and the pte mapping
214 * this subpage is now non-present. If the subpage contains only zeroes,
215 * then map it to the shared zeropage.
216 */
217 addr = kmap_local_page(page);
218 contains_data = memchr_inv(addr, 0, PAGE_SIZE);
219 kunmap_local(addr);
220
221 if (contains_data)
222 return false;
223
224 newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
225 pvmw->vma->vm_page_prot));
226 set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
227
228 dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
229 return true;
230 }
231
232 struct rmap_walk_arg {
233 struct folio *folio;
234 bool map_unused_to_zeropage;
235 };
236
237 /*
238 * Restore a potential migration pte to a working pte entry
239 */
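/*
 * Note: the rmap walk is driven by the destination folio (whose ->mapping
 * and ->index were copied from the source), while the page-vma-mapped walk
 * below looks for migration entries that reference the source folio's pfn.
 * On migration failure the two are the same folio.
 */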
240 static bool remove_migration_pte(struct folio *folio,
241 struct vm_area_struct *vma, unsigned long addr, void *arg)
242 {
243 struct rmap_walk_arg *rmap_walk_arg = arg;
244 DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
245
246 while (page_vma_mapped_walk(&pvmw)) {
247 rmap_t rmap_flags = RMAP_NONE;
248 pte_t old_pte;
249 pte_t pte;
250 swp_entry_t entry;
251 struct page *new;
252 unsigned long idx = 0;
253
254 /* pgoff is invalid for ksm pages, but they are never large */
255 if (folio_test_large(folio) && !folio_test_hugetlb(folio))
256 idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
257 new = folio_page(folio, idx);
258
259 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
260 /* PMD-mapped THP migration entry */
261 if (!pvmw.pte) {
262 VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
263 !folio_test_pmd_mappable(folio), folio);
264 remove_migration_pmd(&pvmw, new);
265 continue;
266 }
267 #endif
268 if (rmap_walk_arg->map_unused_to_zeropage &&
269 try_to_map_unused_to_zeropage(&pvmw, folio, idx))
270 continue;
271
272 folio_get(folio);
273 pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
274 old_pte = ptep_get(pvmw.pte);
275
276 entry = pte_to_swp_entry(old_pte);
277 if (!is_migration_entry_young(entry))
278 pte = pte_mkold(pte);
279 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
280 pte = pte_mkdirty(pte);
281 if (pte_swp_soft_dirty(old_pte))
282 pte = pte_mksoft_dirty(pte);
283 else
284 pte = pte_clear_soft_dirty(pte);
285
286 if (is_writable_migration_entry(entry))
287 pte = pte_mkwrite(pte, vma);
288 else if (pte_swp_uffd_wp(old_pte))
289 pte = pte_mkuffd_wp(pte);
290
291 if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
292 rmap_flags |= RMAP_EXCLUSIVE;
293
294 if (unlikely(is_device_private_page(new))) {
295 if (pte_write(pte))
296 entry = make_writable_device_private_entry(
297 page_to_pfn(new));
298 else
299 entry = make_readable_device_private_entry(
300 page_to_pfn(new));
301 pte = swp_entry_to_pte(entry);
302 if (pte_swp_soft_dirty(old_pte))
303 pte = pte_swp_mksoft_dirty(pte);
304 if (pte_swp_uffd_wp(old_pte))
305 pte = pte_swp_mkuffd_wp(pte);
306 }
307
308 #ifdef CONFIG_HUGETLB_PAGE
309 if (folio_test_hugetlb(folio)) {
310 struct hstate *h = hstate_vma(vma);
311 unsigned int shift = huge_page_shift(h);
312 unsigned long psize = huge_page_size(h);
313
314 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
315 if (folio_test_anon(folio))
316 hugetlb_add_anon_rmap(folio, vma, pvmw.address,
317 rmap_flags);
318 else
319 hugetlb_add_file_rmap(folio);
320 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
321 psize);
322 } else
323 #endif
324 {
325 if (folio_test_anon(folio))
326 folio_add_anon_rmap_pte(folio, new, vma,
327 pvmw.address, rmap_flags);
328 else
329 folio_add_file_rmap_pte(folio, new, vma);
330 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
331 }
332 if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
333 mlock_drain_local();
334
335 trace_remove_migration_pte(pvmw.address, pte_val(pte),
336 compound_order(new));
337
338 /* No need to invalidate - it was non-present before */
339 update_mmu_cache(vma, pvmw.address, pvmw.pte);
340 }
341
342 return true;
343 }
344
345 /*
346 * Get rid of all migration entries and replace them by
347 * references to the indicated page.
348 */
349 void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
350 {
351 struct rmap_walk_arg rmap_walk_arg = {
352 .folio = src,
353 .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
354 };
355
356 struct rmap_walk_control rwc = {
357 .rmap_one = remove_migration_pte,
358 .arg = &rmap_walk_arg,
359 };
360
361 VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
362
363 if (flags & RMP_LOCKED)
364 rmap_walk_locked(dst, &rwc);
365 else
366 rmap_walk(dst, &rwc);
367 }
368
369 /*
370 * Something used the pte of a page under migration. We need to
371 * get to the page and wait until migration is finished.
372 * When we return from this function the fault will be retried.
373 */
374 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
375 unsigned long address)
376 {
377 spinlock_t *ptl;
378 pte_t *ptep;
379 pte_t pte;
380 swp_entry_t entry;
381
382 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
383 if (!ptep)
384 return;
385
386 pte = ptep_get(ptep);
387 pte_unmap(ptep);
388
389 if (!is_swap_pte(pte))
390 goto out;
391
392 entry = pte_to_swp_entry(pte);
393 if (!is_migration_entry(entry))
394 goto out;
395
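/*
 * migration_entry_wait_on_locked() drops the ptl itself before
 * sleeping, so we must not unlock it here on this path.
 */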
396 migration_entry_wait_on_locked(entry, ptl);
397 return;
398 out:
399 spin_unlock(ptl);
400 }
401
402 #ifdef CONFIG_HUGETLB_PAGE
403 /*
404 * The vma read lock must be held upon entry. Holding that lock prevents either
405 * the pte or the ptl from being freed.
406 *
407 * This function will release the vma lock before returning.
408 */
409 void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
410 {
411 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
412 pte_t pte;
413
414 hugetlb_vma_assert_locked(vma);
415 spin_lock(ptl);
416 pte = huge_ptep_get(vma->vm_mm, addr, ptep);
417
418 if (unlikely(!is_hugetlb_entry_migration(pte))) {
419 spin_unlock(ptl);
420 hugetlb_vma_unlock_read(vma);
421 } else {
422 /*
423 * If migration entry existed, safe to release vma lock
424 * here because the pgtable page won't be freed without the
425 * pgtable lock released. See comment right above pgtable
426 * lock release in migration_entry_wait_on_locked().
427 */
428 hugetlb_vma_unlock_read(vma);
429 migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
430 }
431 }
432 #endif
433
434 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
435 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
436 {
437 spinlock_t *ptl;
438
439 ptl = pmd_lock(mm, pmd);
440 if (!is_pmd_migration_entry(*pmd))
441 goto unlock;
442 migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
443 return;
444 unlock:
445 spin_unlock(ptl);
446 }
447 #endif
448
449 /*
450 * Replace the folio in the mapping.
451 *
452 * The number of remaining references must be:
453 * 1 for anonymous folios without a mapping
454 * 2 for folios with a mapping
455 * 3 for folios with a mapping and the private flag set.
456 */
457 static int __folio_migrate_mapping(struct address_space *mapping,
458 struct folio *newfolio, struct folio *folio, int expected_count)
459 {
460 XA_STATE(xas, &mapping->i_pages, folio_index(folio));
461 struct zone *oldzone, *newzone;
462 int dirty;
463 long nr = folio_nr_pages(folio);
464 long entries, i;
465
466 if (!mapping) {
467 /* Take off deferred split queue while frozen and memcg set */
468 if (folio_test_large(folio) &&
469 folio_test_large_rmappable(folio)) {
470 if (!folio_ref_freeze(folio, expected_count))
471 return -EAGAIN;
472 folio_unqueue_deferred_split(folio);
473 folio_ref_unfreeze(folio, expected_count);
474 }
475
476 /* No turning back from here */
477 newfolio->index = folio->index;
478 newfolio->mapping = folio->mapping;
479 if (folio_test_anon(folio) && folio_test_large(folio))
480 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
481 if (folio_test_swapbacked(folio))
482 __folio_set_swapbacked(newfolio);
483
484 return MIGRATEPAGE_SUCCESS;
485 }
486
487 oldzone = folio_zone(folio);
488 newzone = folio_zone(newfolio);
489
490 xas_lock_irq(&xas);
491 if (!folio_ref_freeze(folio, expected_count)) {
492 xas_unlock_irq(&xas);
493 return -EAGAIN;
494 }
495
496 /* Take off deferred split queue while frozen and memcg set */
497 folio_unqueue_deferred_split(folio);
498
499 /*
500 * Now we know that no one else is looking at the folio:
501 * no turning back from here.
502 */
503 newfolio->index = folio->index;
504 newfolio->mapping = folio->mapping;
505 if (folio_test_anon(folio) && folio_test_large(folio))
506 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
507 folio_ref_add(newfolio, nr); /* add cache reference */
508 if (folio_test_swapbacked(folio))
509 __folio_set_swapbacked(newfolio);
510 if (folio_test_swapcache(folio)) {
511 folio_set_swapcache(newfolio);
512 newfolio->private = folio_get_private(folio);
513 entries = nr;
514 } else {
515 entries = 1;
516 }
517
518 /* Move dirty while folio refs frozen and newfolio not yet exposed */
519 dirty = folio_test_dirty(folio);
520 if (dirty) {
521 folio_clear_dirty(folio);
522 folio_set_dirty(newfolio);
523 }
524
525 /* Swap cache still stores N entries instead of a high-order entry */
526 for (i = 0; i < entries; i++) {
527 xas_store(&xas, newfolio);
528 xas_next(&xas);
529 }
530
531 /*
532 * Drop cache reference from old folio by unfreezing
533 * to one less reference.
534 * We know this isn't the last reference.
535 */
536 folio_ref_unfreeze(folio, expected_count - nr);
537
538 xas_unlock(&xas);
539 /* Leave irq disabled to prevent preemption while updating stats */
540
541 /*
542 * If moved to a different zone then also account
543 * the folio for that zone. Other VM counters will be
544 * taken care of when we establish references to the
545 * new folio and drop references to the old folio.
546 *
547 * Note that anonymous folios are accounted for
548 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
549 * are mapped to swap space.
550 */
551 if (newzone != oldzone) {
552 struct lruvec *old_lruvec, *new_lruvec;
553 struct mem_cgroup *memcg;
554
555 memcg = folio_memcg(folio);
556 old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
557 new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
558
559 __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
560 __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
561 if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
562 __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
563 __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
564
565 if (folio_test_pmd_mappable(folio)) {
566 __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
567 __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
568 }
569 }
570 #ifdef CONFIG_SWAP
571 if (folio_test_swapcache(folio)) {
572 __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
573 __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
574 }
575 #endif
576 if (dirty && mapping_can_writeback(mapping)) {
577 __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
578 __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
579 __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
580 __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
581 }
582 }
583 local_irq_enable();
584
585 return MIGRATEPAGE_SUCCESS;
586 }
587
588 int folio_migrate_mapping(struct address_space *mapping,
589 struct folio *newfolio, struct folio *folio, int extra_count)
590 {
591 int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
592
593 if (folio_ref_count(folio) != expected_count)
594 return -EAGAIN;
595
596 return __folio_migrate_mapping(mapping, newfolio, folio, expected_count);
597 }
598 EXPORT_SYMBOL(folio_migrate_mapping);
599
600 /*
601 * The expected number of remaining references is the same as that
602 * of folio_migrate_mapping().
603 */
604 int migrate_huge_page_move_mapping(struct address_space *mapping,
605 struct folio *dst, struct folio *src)
606 {
607 XA_STATE(xas, &mapping->i_pages, folio_index(src));
608 int rc, expected_count = folio_expected_ref_count(src) + 1;
609
610 if (folio_ref_count(src) != expected_count)
611 return -EAGAIN;
612
613 rc = folio_mc_copy(dst, src);
614 if (unlikely(rc))
615 return rc;
616
617 xas_lock_irq(&xas);
618 if (!folio_ref_freeze(src, expected_count)) {
619 xas_unlock_irq(&xas);
620 return -EAGAIN;
621 }
622
623 dst->index = src->index;
624 dst->mapping = src->mapping;
625
626 folio_ref_add(dst, folio_nr_pages(dst));
627
628 xas_store(&xas, dst);
629
630 folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
631
632 xas_unlock_irq(&xas);
633
634 return MIGRATEPAGE_SUCCESS;
635 }
636
637 /*
638 * Copy the flags and some other ancillary information
639 */
640 void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
641 {
642 int cpupid;
643
644 if (folio_test_referenced(folio))
645 folio_set_referenced(newfolio);
646 if (folio_test_uptodate(folio))
647 folio_mark_uptodate(newfolio);
648 if (folio_test_clear_active(folio)) {
649 VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
650 folio_set_active(newfolio);
651 } else if (folio_test_clear_unevictable(folio))
652 folio_set_unevictable(newfolio);
653 if (folio_test_workingset(folio))
654 folio_set_workingset(newfolio);
655 if (folio_test_checked(folio))
656 folio_set_checked(newfolio);
657 /*
658 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
659 * migration entries. We can still have PG_anon_exclusive set on the
660 * effectively unmapped and unreferenced first sub-page of an
661 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
662 */
663 if (folio_test_mappedtodisk(folio))
664 folio_set_mappedtodisk(newfolio);
665
666 /* Move dirty on pages not done by folio_migrate_mapping() */
667 if (folio_test_dirty(folio))
668 folio_set_dirty(newfolio);
669
670 if (folio_test_young(folio))
671 folio_set_young(newfolio);
672 if (folio_test_idle(folio))
673 folio_set_idle(newfolio);
674
675 folio_migrate_refs(newfolio, folio);
676 /*
677 * Copy NUMA information to the new page, to prevent over-eager
678 * future migrations of this same page.
679 */
680 cpupid = folio_xchg_last_cpupid(folio, -1);
681 /*
682 * For memory tiering mode, when migrating between slow and fast
683 * memory nodes, reset the cpupid, because it is used to record
684 * the page access time on the slow memory node.
685 */
686 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
687 bool f_toptier = node_is_toptier(folio_nid(folio));
688 bool t_toptier = node_is_toptier(folio_nid(newfolio));
689
690 if (f_toptier != t_toptier)
691 cpupid = -1;
692 }
693 folio_xchg_last_cpupid(newfolio, cpupid);
694
695 folio_migrate_ksm(newfolio, folio);
696 /*
697 * Please do not reorder this without considering how mm/ksm.c's
698 * ksm_get_folio() depends upon ksm_migrate_page() and the
699 * swapcache flag.
700 */
701 if (folio_test_swapcache(folio))
702 folio_clear_swapcache(folio);
703 folio_clear_private(folio);
704
705 /* page->private contains hugetlb specific flags */
706 if (!folio_test_hugetlb(folio))
707 folio->private = NULL;
708
709 /*
710 * If any waiters have accumulated on the new page then
711 * wake them up.
712 */
713 if (folio_test_writeback(newfolio))
714 folio_end_writeback(newfolio);
715
716 /*
717 * PG_readahead shares the same bit with PG_reclaim. The
718 * folio_end_writeback() above may clear PG_readahead mistakenly, so set the
719 * bit after that.
720 */
721 if (folio_test_readahead(folio))
722 folio_set_readahead(newfolio);
723
724 folio_copy_owner(newfolio, folio);
725 pgalloc_tag_swap(newfolio, folio);
726
727 mem_cgroup_migrate(folio, newfolio);
728 }
729 EXPORT_SYMBOL(folio_migrate_flags);
730
731 /************************************************************
732 * Migration functions
733 ***********************************************************/
734
735 static int __migrate_folio(struct address_space *mapping, struct folio *dst,
736 struct folio *src, void *src_private,
737 enum migrate_mode mode)
738 {
739 int rc, expected_count = folio_expected_ref_count(src) + 1;
740
741 /* Check whether src does not have extra refs before we do more work */
742 if (folio_ref_count(src) != expected_count)
743 return -EAGAIN;
744
745 rc = folio_mc_copy(dst, src);
746 if (unlikely(rc))
747 return rc;
748
749 rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
750 if (rc != MIGRATEPAGE_SUCCESS)
751 return rc;
752
753 if (src_private)
754 folio_attach_private(dst, folio_detach_private(src));
755
756 folio_migrate_flags(dst, src);
757 return MIGRATEPAGE_SUCCESS;
758 }
759
760 /**
761 * migrate_folio() - Simple folio migration.
762 * @mapping: The address_space containing the folio.
763 * @dst: The folio to migrate the data to.
764 * @src: The folio containing the current data.
765 * @mode: How to migrate the page.
766 *
767 * Common logic to directly migrate a single LRU folio suitable for
768 * folios that do not have private data.
769 *
770 * Folios are locked upon entry and exit.
771 */
772 int migrate_folio(struct address_space *mapping, struct folio *dst,
773 struct folio *src, enum migrate_mode mode)
774 {
775 BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
776 return __migrate_folio(mapping, dst, src, NULL, mode);
777 }
778 EXPORT_SYMBOL(migrate_folio);
779
780 #ifdef CONFIG_BUFFER_HEAD
781 /* Returns true if all buffers are successfully locked */
782 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
783 enum migrate_mode mode)
784 {
785 struct buffer_head *bh = head;
786 struct buffer_head *failed_bh;
787
788 do {
789 if (!trylock_buffer(bh)) {
790 if (mode == MIGRATE_ASYNC)
791 goto unlock;
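/*
 * A locked buffer that is not yet uptodate is most likely under
 * read I/O; MIGRATE_SYNC_LIGHT does not wait for I/O to complete.
 */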
792 if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
793 goto unlock;
794 lock_buffer(bh);
795 }
796
797 bh = bh->b_this_page;
798 } while (bh != head);
799
800 return true;
801
802 unlock:
803 /* We failed to lock the buffer and cannot stall. */
804 failed_bh = bh;
805 bh = head;
806 while (bh != failed_bh) {
807 unlock_buffer(bh);
808 bh = bh->b_this_page;
809 }
810
811 return false;
812 }
813
814 static int __buffer_migrate_folio(struct address_space *mapping,
815 struct folio *dst, struct folio *src, enum migrate_mode mode,
816 bool check_refs)
817 {
818 struct buffer_head *bh, *head;
819 int rc;
820 int expected_count;
821
822 head = folio_buffers(src);
823 if (!head)
824 return migrate_folio(mapping, dst, src, mode);
825
826 /* Check whether page does not have extra refs before we do more work */
827 expected_count = folio_expected_ref_count(src) + 1;
828 if (folio_ref_count(src) != expected_count)
829 return -EAGAIN;
830
831 if (!buffer_migrate_lock_buffers(head, mode))
832 return -EAGAIN;
833
834 if (check_refs) {
835 bool busy, migrating;
836 bool invalidated = false;
837
838 migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
839 VM_WARN_ON_ONCE(migrating);
840 recheck_buffers:
841 busy = false;
842 spin_lock(&mapping->i_private_lock);
843 bh = head;
844 do {
845 if (atomic_read(&bh->b_count)) {
846 busy = true;
847 break;
848 }
849 bh = bh->b_this_page;
850 } while (bh != head);
851 spin_unlock(&mapping->i_private_lock);
852 if (busy) {
853 if (invalidated) {
854 rc = -EAGAIN;
855 goto unlock_buffers;
856 }
857 invalidate_bh_lrus();
858 invalidated = true;
859 goto recheck_buffers;
860 }
861 }
862
863 rc = filemap_migrate_folio(mapping, dst, src, mode);
864 if (rc != MIGRATEPAGE_SUCCESS)
865 goto unlock_buffers;
866
867 bh = head;
868 do {
869 folio_set_bh(bh, dst, bh_offset(bh));
870 bh = bh->b_this_page;
871 } while (bh != head);
872
873 unlock_buffers:
874 if (check_refs)
875 clear_bit_unlock(BH_Migrate, &head->b_state);
876 bh = head;
877 do {
878 unlock_buffer(bh);
879 bh = bh->b_this_page;
880 } while (bh != head);
881
882 return rc;
883 }
884
885 /**
886 * buffer_migrate_folio() - Migration function for folios with buffers.
887 * @mapping: The address space containing @src.
888 * @dst: The folio to migrate to.
889 * @src: The folio to migrate from.
890 * @mode: How to migrate the folio.
891 *
892 * This function can only be used if the underlying filesystem guarantees
893 * that no other references to @src exist. For example attached buffer
894 * heads are accessed only under the folio lock. If your filesystem cannot
895 * provide this guarantee, buffer_migrate_folio_norefs() may be more
896 * appropriate.
897 *
898 * Return: 0 on success or a negative errno on failure.
899 */
900 int buffer_migrate_folio(struct address_space *mapping,
901 struct folio *dst, struct folio *src, enum migrate_mode mode)
902 {
903 return __buffer_migrate_folio(mapping, dst, src, mode, false);
904 }
905 EXPORT_SYMBOL(buffer_migrate_folio);
906
907 /**
908 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
909 * @mapping: The address space containing @src.
910 * @dst: The folio to migrate to.
911 * @src: The folio to migrate from.
912 * @mode: How to migrate the folio.
913 *
914 * Like buffer_migrate_folio() except that this variant is more careful
915 * and checks that there are also no buffer head references. This function
916 * is the right one for mappings where buffer heads are directly looked
917 * up and referenced (such as block device mappings).
918 *
919 * Return: 0 on success or a negative errno on failure.
920 */
921 int buffer_migrate_folio_norefs(struct address_space *mapping,
922 struct folio *dst, struct folio *src, enum migrate_mode mode)
923 {
924 return __buffer_migrate_folio(mapping, dst, src, mode, true);
925 }
926 EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
927 #endif /* CONFIG_BUFFER_HEAD */
928
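/*
 * Like migrate_folio(), but also moves folio->private across, so it is
 * suitable for mappings that keep per-folio state attached via
 * folio_attach_private().
 */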
929 int filemap_migrate_folio(struct address_space *mapping,
930 struct folio *dst, struct folio *src, enum migrate_mode mode)
931 {
932 return __migrate_folio(mapping, dst, src, folio_get_private(src), mode);
933 }
934 EXPORT_SYMBOL_GPL(filemap_migrate_folio);
935
936 /*
937 * Default handling if a filesystem does not provide a migration function.
938 */
939 static int fallback_migrate_folio(struct address_space *mapping,
940 struct folio *dst, struct folio *src, enum migrate_mode mode)
941 {
942 WARN_ONCE(mapping->a_ops->writepages,
943 "%ps does not implement migrate_folio\n",
944 mapping->a_ops);
945 if (folio_test_dirty(src))
946 return -EBUSY;
947
948 /*
949 * Filesystem may have private data at folio->private that we
950 * can't migrate automatically.
951 */
952 if (!filemap_release_folio(src, GFP_KERNEL))
953 return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
954
955 return migrate_folio(mapping, dst, src, mode);
956 }
957
958 /*
959 * Move a page to a newly allocated page
960 * The page is locked and all ptes have been successfully removed.
961 *
962 * The new page will have replaced the old page if this function
963 * is successful.
964 *
965 * Return value:
966 * < 0 - error code
967 * MIGRATEPAGE_SUCCESS - success
968 */
969 static int move_to_new_folio(struct folio *dst, struct folio *src,
970 enum migrate_mode mode)
971 {
972 int rc = -EAGAIN;
973 bool is_lru = !__folio_test_movable(src);
974
975 VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
976 VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
977
978 if (likely(is_lru)) {
979 struct address_space *mapping = folio_mapping(src);
980
981 if (!mapping)
982 rc = migrate_folio(mapping, dst, src, mode);
983 else if (mapping_inaccessible(mapping))
984 rc = -EOPNOTSUPP;
985 else if (mapping->a_ops->migrate_folio)
986 /*
987 * Most folios have a mapping and most filesystems
988 * provide a migrate_folio callback. Anonymous folios
989 * are part of swap space which also has its own
990 * migrate_folio callback. This is the most common path
991 * for page migration.
992 */
993 rc = mapping->a_ops->migrate_folio(mapping, dst, src,
994 mode);
995 else
996 rc = fallback_migrate_folio(mapping, dst, src, mode);
997 } else {
998 const struct movable_operations *mops;
999
1000 /*
1001 * In the case of a non-LRU page, it could have been released after
1002 * the isolation step. In that case, we shouldn't try migration.
1003 */
1004 VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
1005 if (!folio_test_movable(src)) {
1006 rc = MIGRATEPAGE_SUCCESS;
1007 folio_clear_isolated(src);
1008 goto out;
1009 }
1010
1011 mops = folio_movable_ops(src);
1012 rc = mops->migrate_page(&dst->page, &src->page, mode);
1013 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
1014 !folio_test_isolated(src));
1015 }
1016
1017 /*
1018 * When successful, old pagecache src->mapping must be cleared before
1019 * src is freed; but stats require that PageAnon be left as PageAnon.
1020 */
1021 if (rc == MIGRATEPAGE_SUCCESS) {
1022 if (__folio_test_movable(src)) {
1023 VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
1024
1025 /*
1026 * We clear PG_movable under page_lock so any compactor
1027 * cannot try to migrate this page.
1028 */
1029 folio_clear_isolated(src);
1030 }
1031
1032 /*
1033 * Anonymous and movable src->mapping will be cleared by
1034 * free_pages_prepare(), so don't reset it here, in order to keep
1035 * type checks such as PageAnon working.
1036 */
1037 if (!folio_mapping_flags(src))
1038 src->mapping = NULL;
1039
1040 if (likely(!folio_is_zone_device(dst)))
1041 flush_dcache_folio(dst);
1042 }
1043 out:
1044 return rc;
1045 }
1046
1047 /*
1048 * To record some information during migration, we use unused private
1049 * field of struct folio of the newly allocated destination folio.
1050 * This is safe because nobody is using it except us.
1051 */
1052 enum {
1053 PAGE_WAS_MAPPED = BIT(0),
1054 PAGE_WAS_MLOCKED = BIT(1),
1055 PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
1056 };
1057
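/*
 * Note: the PAGE_OLD_STATES bits are folded into the low bits of the
 * anon_vma pointer stored in dst->private. This relies on struct anon_vma
 * allocations being aligned to at least four bytes, so those bits are
 * otherwise always zero.
 */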
1058 static void __migrate_folio_record(struct folio *dst,
1059 int old_page_state,
1060 struct anon_vma *anon_vma)
1061 {
1062 dst->private = (void *)anon_vma + old_page_state;
1063 }
1064
1065 static void __migrate_folio_extract(struct folio *dst,
1066 int *old_page_state,
1067 struct anon_vma **anon_vmap)
1068 {
1069 unsigned long private = (unsigned long)dst->private;
1070
1071 *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
1072 *old_page_state = private & PAGE_OLD_STATES;
1073 dst->private = NULL;
1074 }
1075
1076 /* Restore the source folio to the original state upon failure */
1077 static void migrate_folio_undo_src(struct folio *src,
1078 int page_was_mapped,
1079 struct anon_vma *anon_vma,
1080 bool locked,
1081 struct list_head *ret)
1082 {
1083 if (page_was_mapped)
1084 remove_migration_ptes(src, src, 0);
1085 /* Drop an anon_vma reference if we took one */
1086 if (anon_vma)
1087 put_anon_vma(anon_vma);
1088 if (locked)
1089 folio_unlock(src);
1090 if (ret)
1091 list_move_tail(&src->lru, ret);
1092 }
1093
1094 /* Restore the destination folio to the original state upon failure */
1095 static void migrate_folio_undo_dst(struct folio *dst, bool locked,
1096 free_folio_t put_new_folio, unsigned long private)
1097 {
1098 if (locked)
1099 folio_unlock(dst);
1100 if (put_new_folio)
1101 put_new_folio(dst, private);
1102 else
1103 folio_put(dst);
1104 }
1105
1106 /* Cleanup src folio upon migration success */
1107 static void migrate_folio_done(struct folio *src,
1108 enum migrate_reason reason)
1109 {
1110 /*
1111 * Compaction can also migrate non-LRU pages, which are
1112 * not accounted in NR_ISOLATED_*. They can be recognized
1113 * via __folio_test_movable().
1114 */
1115 if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
1116 mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
1117 folio_is_file_lru(src), -folio_nr_pages(src));
1118
1119 if (reason != MR_MEMORY_FAILURE)
1120 /* We release the page in page_handle_poison. */
1121 folio_put(src);
1122 }
1123
1124 /* Obtain the lock on page, remove all ptes. */
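/*
 * Returns MIGRATEPAGE_SUCCESS if the folio was freed under us,
 * MIGRATEPAGE_UNMAP if it was unmapped (src and dst stay locked and the
 * old state is recorded in dst->private), -ENOMEM if no destination could
 * be allocated, or -EAGAIN/another errno on failure after undoing all work.
 */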
1125 static int migrate_folio_unmap(new_folio_t get_new_folio,
1126 free_folio_t put_new_folio, unsigned long private,
1127 struct folio *src, struct folio **dstp, enum migrate_mode mode,
1128 enum migrate_reason reason, struct list_head *ret)
1129 {
1130 struct folio *dst;
1131 int rc = -EAGAIN;
1132 int old_page_state = 0;
1133 struct anon_vma *anon_vma = NULL;
1134 bool is_lru = data_race(!__folio_test_movable(src));
1135 bool locked = false;
1136 bool dst_locked = false;
1137
1138 if (folio_ref_count(src) == 1) {
1139 /* Folio was freed from under us. So we are done. */
1140 folio_clear_active(src);
1141 folio_clear_unevictable(src);
1142 /* free_pages_prepare() will clear PG_isolated. */
1143 list_del(&src->lru);
1144 migrate_folio_done(src, reason);
1145 return MIGRATEPAGE_SUCCESS;
1146 }
1147
1148 dst = get_new_folio(src, private);
1149 if (!dst)
1150 return -ENOMEM;
1151 *dstp = dst;
1152
1153 dst->private = NULL;
1154
1155 if (!folio_trylock(src)) {
1156 if (mode == MIGRATE_ASYNC)
1157 goto out;
1158
1159 /*
1160 * It's not safe for direct compaction to call lock_page.
1161 * For example, during page readahead pages are added locked
1162 * to the LRU. Later, when the IO completes the pages are
1163 * marked uptodate and unlocked. However, the queueing
1164 * could be merging multiple pages for one bio (e.g.
1165 * mpage_readahead). If an allocation happens for the
1166 * second or third page, the process can end up locking
1167 * the same page twice and deadlocking. Rather than
1168 * trying to be clever about what pages can be locked,
1169 * avoid the use of lock_page for direct compaction
1170 * altogether.
1171 */
1172 if (current->flags & PF_MEMALLOC)
1173 goto out;
1174
1175 /*
1176 * In "light" mode, we can wait for transient locks (eg
1177 * inserting a page into the page table), but it's not
1178 * worth waiting for I/O.
1179 */
1180 if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
1181 goto out;
1182
1183 folio_lock(src);
1184 }
1185 locked = true;
1186 if (folio_test_mlocked(src))
1187 old_page_state |= PAGE_WAS_MLOCKED;
1188
1189 if (folio_test_writeback(src)) {
1190 /*
1191 * Only in the case of a full synchronous migration is it
1192 * necessary to wait for PageWriteback. In the async case,
1193 * the retry loop is too short and in the sync-light case,
1194 * the overhead of stalling is too much.
1195 */
1196 switch (mode) {
1197 case MIGRATE_SYNC:
1198 break;
1199 default:
1200 rc = -EBUSY;
1201 goto out;
1202 }
1203 folio_wait_writeback(src);
1204 }
1205
1206 /*
1207 * try_to_migrate() will drop src->mapcount to 0 below. At that point
1208 * we could no longer notice that the anon_vma has been freed while we
1209 * migrate the page, so folio_get_anon_vma() here delays freeing the
1210 * anon_vma until the end of migration. File cache pages are not a
1211 * problem because they are protected by the page lock, so only
1212 * anonymous pages need this care.
1213 *
1214 * Only folio_get_anon_vma() understands the subtleties of
1215 * getting a hold on an anon_vma from outside one of its mms.
1216 * But if we cannot get anon_vma, then we won't need it anyway,
1217 * because that implies that the anon page is no longer mapped
1218 * (and cannot be remapped so long as we hold the page lock).
1219 */
1220 if (folio_test_anon(src) && !folio_test_ksm(src))
1221 anon_vma = folio_get_anon_vma(src);
1222
1223 /*
1224 * Block others from accessing the new page when we get around to
1225 * establishing additional references. We are usually the only one
1226 * holding a reference to dst at this point. We used to have a BUG
1227 * here if folio_trylock(dst) fails, but would like to allow for
1228 * cases where there might be a race with the previous use of dst.
1229 * This is much like races on refcount of oldpage: just don't BUG().
1230 */
1231 if (unlikely(!folio_trylock(dst)))
1232 goto out;
1233 dst_locked = true;
1234
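/*
 * Non-LRU movable folios are driver-owned and have no user page table
 * mappings, so there are no migration PTEs to install; the unmap stage
 * is complete as soon as both folios are locked.
 */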
1235 if (unlikely(!is_lru)) {
1236 __migrate_folio_record(dst, old_page_state, anon_vma);
1237 return MIGRATEPAGE_UNMAP;
1238 }
1239
1240 /*
1241 * Corner case handling:
1242 * 1. When a new swap-cache page is read in, it is added to the LRU
1243 * and treated as swapcache but it has no rmap yet.
1244 * Calling try_to_unmap() against a src->mapping==NULL page will
1245 * trigger a BUG. So handle it here.
1246 * 2. An orphaned page (see truncate_cleanup_page) might have
1247 * fs-private metadata. The page can be picked up due to memory
1248 * offlining. Everywhere else except page reclaim, the page is
1249 * invisible to the vm, so the page can not be migrated. So try to
1250 * free the metadata, so the page can be freed.
1251 */
1252 if (!src->mapping) {
1253 if (folio_test_private(src)) {
1254 try_to_free_buffers(src);
1255 goto out;
1256 }
1257 } else if (folio_mapped(src)) {
1258 /* Establish migration ptes */
1259 VM_BUG_ON_FOLIO(folio_test_anon(src) &&
1260 !folio_test_ksm(src) && !anon_vma, src);
1261 try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
1262 old_page_state |= PAGE_WAS_MAPPED;
1263 }
1264
1265 if (!folio_mapped(src)) {
1266 __migrate_folio_record(dst, old_page_state, anon_vma);
1267 return MIGRATEPAGE_UNMAP;
1268 }
1269
1270 out:
1271 /*
1272 * A folio that has not been unmapped will be restored to
1273 * right list unless we want to retry.
1274 */
1275 if (rc == -EAGAIN)
1276 ret = NULL;
1277
1278 migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
1279 anon_vma, locked, ret);
1280 migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
1281
1282 return rc;
1283 }
1284
1285 /* Migrate the folio to the newly allocated folio in dst. */
1286 static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
1287 struct folio *src, struct folio *dst,
1288 enum migrate_mode mode, enum migrate_reason reason,
1289 struct list_head *ret)
1290 {
1291 int rc;
1292 int old_page_state = 0;
1293 struct anon_vma *anon_vma = NULL;
1294 bool is_lru = !__folio_test_movable(src);
1295 struct list_head *prev;
1296
1297 __migrate_folio_extract(dst, &old_page_state, &anon_vma);
1298 prev = dst->lru.prev;
1299 list_del(&dst->lru);
1300
1301 rc = move_to_new_folio(dst, src, mode);
1302 if (rc)
1303 goto out;
1304
1305 if (unlikely(!is_lru))
1306 goto out_unlock_both;
1307
1308 /*
1309 * When successful, push dst to LRU immediately: so that if it
1310 * turns out to be an mlocked page, remove_migration_ptes() will
1311 * automatically build up the correct dst->mlock_count for it.
1312 *
1313 * We would like to do something similar for the old page, when
1314 * unsuccessful, and other cases when a page has been temporarily
1315 * isolated from the unevictable LRU: but this case is the easiest.
1316 */
1317 folio_add_lru(dst);
1318 if (old_page_state & PAGE_WAS_MLOCKED)
1319 lru_add_drain();
1320
1321 if (old_page_state & PAGE_WAS_MAPPED)
1322 remove_migration_ptes(src, dst, 0);
1323
1324 out_unlock_both:
1325 folio_unlock(dst);
1326 set_page_owner_migrate_reason(&dst->page, reason);
1327 /*
1328 * If migration is successful, drop our reference to dst;
1329 * this will not free the page because the new page owner still
1330 * holds a reference.
1331 */
1332 folio_put(dst);
1333
1334 /*
1335 * A folio that has been migrated has all references removed
1336 * and will be freed.
1337 */
1338 list_del(&src->lru);
1339 /* Drop an anon_vma reference if we took one */
1340 if (anon_vma)
1341 put_anon_vma(anon_vma);
1342 folio_unlock(src);
1343 migrate_folio_done(src, reason);
1344
1345 return rc;
1346 out:
1347 /*
1348 * A folio that has not been migrated will be restored to
1349 * right list unless we want to retry.
1350 */
1351 if (rc == -EAGAIN) {
1352 list_add(&dst->lru, prev);
1353 __migrate_folio_record(dst, old_page_state, anon_vma);
1354 return rc;
1355 }
1356
1357 migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
1358 anon_vma, true, ret);
1359 migrate_folio_undo_dst(dst, true, put_new_folio, private);
1360
1361 return rc;
1362 }
1363
1364 /*
1365 * Counterpart of unmap_and_move_page() for hugepage migration.
1366 *
1367 * This function doesn't wait for the completion of hugepage I/O
1368 * because there is no race between I/O and migration for hugepage.
1369 * Note that currently hugepage I/O occurs only in direct I/O
1370 * where no lock is held and PG_writeback is irrelevant,
1371 * and writeback status of all subpages are counted in the reference
1372 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1373 * under direct I/O, the reference of the head page is 512 and a bit more.)
1374 * This means that when we try to migrate hugepage whose subpages are
1375 * doing direct I/O, some references remain after try_to_unmap() and
1376 * hugepage migration fails without data corruption.
1377 *
1378 * There is also no race when direct I/O is issued on the page under migration,
1379 * because then pte is replaced with migration swap entry and direct I/O code
1380 * will wait in the page fault for migration to complete.
1381 */
1382 static int unmap_and_move_huge_page(new_folio_t get_new_folio,
1383 free_folio_t put_new_folio, unsigned long private,
1384 struct folio *src, int force, enum migrate_mode mode,
1385 int reason, struct list_head *ret)
1386 {
1387 struct folio *dst;
1388 int rc = -EAGAIN;
1389 int page_was_mapped = 0;
1390 struct anon_vma *anon_vma = NULL;
1391 struct address_space *mapping = NULL;
1392
1393 if (folio_ref_count(src) == 1) {
1394 /* page was freed from under us. So we are done. */
1395 folio_putback_hugetlb(src);
1396 return MIGRATEPAGE_SUCCESS;
1397 }
1398
1399 dst = get_new_folio(src, private);
1400 if (!dst)
1401 return -ENOMEM;
1402
1403 if (!folio_trylock(src)) {
1404 if (!force)
1405 goto out;
1406 switch (mode) {
1407 case MIGRATE_SYNC:
1408 break;
1409 default:
1410 goto out;
1411 }
1412 folio_lock(src);
1413 }
1414
1415 /*
1416 * Check for pages which are in the process of being freed. Without
1417 * folio_mapping() set, hugetlbfs specific move page routine will not
1418 * be called and we could leak usage counts for subpools.
1419 */
1420 if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
1421 rc = -EBUSY;
1422 goto out_unlock;
1423 }
1424
1425 if (folio_test_anon(src))
1426 anon_vma = folio_get_anon_vma(src);
1427
1428 if (unlikely(!folio_trylock(dst)))
1429 goto put_anon;
1430
1431 if (folio_mapped(src)) {
1432 enum ttu_flags ttu = 0;
1433
1434 if (!folio_test_anon(src)) {
1435 /*
1436 * In shared mappings, try_to_unmap could potentially
1437 * call huge_pmd_unshare. Because of this, take
1438 * semaphore in write mode here and set TTU_RMAP_LOCKED
1439 * to let lower levels know we have taken the lock.
1440 */
1441 mapping = hugetlb_folio_mapping_lock_write(src);
1442 if (unlikely(!mapping))
1443 goto unlock_put_anon;
1444
1445 ttu = TTU_RMAP_LOCKED;
1446 }
1447
1448 try_to_migrate(src, ttu);
1449 page_was_mapped = 1;
1450
1451 if (ttu & TTU_RMAP_LOCKED)
1452 i_mmap_unlock_write(mapping);
1453 }
1454
1455 if (!folio_mapped(src))
1456 rc = move_to_new_folio(dst, src, mode);
1457
1458 if (page_was_mapped)
1459 remove_migration_ptes(src,
1460 rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
1461
1462 unlock_put_anon:
1463 folio_unlock(dst);
1464
1465 put_anon:
1466 if (anon_vma)
1467 put_anon_vma(anon_vma);
1468
1469 if (rc == MIGRATEPAGE_SUCCESS) {
1470 move_hugetlb_state(src, dst, reason);
1471 put_new_folio = NULL;
1472 }
1473
1474 out_unlock:
1475 folio_unlock(src);
1476 out:
1477 if (rc == MIGRATEPAGE_SUCCESS)
1478 folio_putback_hugetlb(src);
1479 else if (rc != -EAGAIN)
1480 list_move_tail(&src->lru, ret);
1481
1482 /*
1483 * If migration was not successful and there's a freeing callback,
1484 * return the folio to that special allocator. Otherwise, simply drop
1485 * our additional reference.
1486 */
1487 if (put_new_folio)
1488 put_new_folio(dst, private);
1489 else
1490 folio_put(dst);
1491
1492 return rc;
1493 }
1494
1495 static inline int try_split_folio(struct folio *folio, struct list_head *split_folios,
1496 enum migrate_mode mode)
1497 {
1498 int rc;
1499
1500 if (mode == MIGRATE_ASYNC) {
1501 if (!folio_trylock(folio))
1502 return -EAGAIN;
1503 } else {
1504 folio_lock(folio);
1505 }
1506 rc = split_folio_to_list(folio, split_folios);
1507 folio_unlock(folio);
1508 if (!rc)
1509 list_move_tail(&folio->lru, split_folios);
1510
1511 return rc;
1512 }
1513
1514 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1515 #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR
1516 #else
1517 #define NR_MAX_BATCHED_MIGRATION 512
1518 #endif
1519 #define NR_MAX_MIGRATE_PAGES_RETRY 10
1520 #define NR_MAX_MIGRATE_ASYNC_RETRY 3
1521 #define NR_MAX_MIGRATE_SYNC_RETRY \
1522 (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
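/*
 * The split above is presumably so that a synchronous caller still gets
 * NR_MAX_MIGRATE_PAGES_RETRY passes in total: the first
 * NR_MAX_MIGRATE_ASYNC_RETRY passes are attempted as an async batch and
 * the remaining NR_MAX_MIGRATE_SYNC_RETRY passes are done one folio at a
 * time in the requested synchronous mode.
 */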
1523
1524 struct migrate_pages_stats {
1525 int nr_succeeded; /* Normal and large folios migrated successfully, in
1526 units of base pages */
1527 int nr_failed_pages; /* Normal and large folios failed to be migrated, in
1528 units of base pages. Untried folios aren't counted */
1529 int nr_thp_succeeded; /* THP migrated successfully */
1530 int nr_thp_failed; /* THP failed to be migrated */
1531 int nr_thp_split; /* THP split before migrating */
1532 int nr_split; /* Large folio (include THP) split before migrating */
1533 };
1534
1535 /*
1536 * Returns the number of hugetlb folios that were not migrated, or an error code
1537 * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
1538 * any more because the list has become empty or no retryable hugetlb folios
1539 * exist any more. It is the caller's responsibility to call putback_movable_pages()
1540 * only if ret != 0.
1541 */
1542 static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
1543 free_folio_t put_new_folio, unsigned long private,
1544 enum migrate_mode mode, int reason,
1545 struct migrate_pages_stats *stats,
1546 struct list_head *ret_folios)
1547 {
1548 int retry = 1;
1549 int nr_failed = 0;
1550 int nr_retry_pages = 0;
1551 int pass = 0;
1552 struct folio *folio, *folio2;
1553 int rc, nr_pages;
1554
1555 for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
1556 retry = 0;
1557 nr_retry_pages = 0;
1558
1559 list_for_each_entry_safe(folio, folio2, from, lru) {
1560 if (!folio_test_hugetlb(folio))
1561 continue;
1562
1563 nr_pages = folio_nr_pages(folio);
1564
1565 cond_resched();
1566
1567 /*
1568 * Migratability of hugepages depends on architectures and
1569 * their size. This check is necessary because some callers
1570 * of hugepage migration like soft offline and memory
1571 * hotremove don't walk through page tables or check whether
1572 * the hugepage is pmd-based or not before kicking migration.
1573 */
1574 if (!hugepage_migration_supported(folio_hstate(folio))) {
1575 nr_failed++;
1576 stats->nr_failed_pages += nr_pages;
1577 list_move_tail(&folio->lru, ret_folios);
1578 continue;
1579 }
1580
1581 rc = unmap_and_move_huge_page(get_new_folio,
1582 put_new_folio, private,
1583 folio, pass > 2, mode,
1584 reason, ret_folios);
1585 /*
1586 * The rules are:
1587 * Success: hugetlb folio will be put back
1588 * -EAGAIN: stay on the from list
1589 * -ENOMEM: stay on the from list
1590 * Other errno: put on ret_folios list
1591 */
1592 switch(rc) {
1593 case -ENOMEM:
1594 /*
1595 * When memory is low, don't bother to try to migrate
1596 * other folios, just exit.
1597 */
1598 stats->nr_failed_pages += nr_pages + nr_retry_pages;
1599 return -ENOMEM;
1600 case -EAGAIN:
1601 retry++;
1602 nr_retry_pages += nr_pages;
1603 break;
1604 case MIGRATEPAGE_SUCCESS:
1605 stats->nr_succeeded += nr_pages;
1606 break;
1607 default:
1608 /*
1609 * Permanent failure (-EBUSY, etc.):
1610 * unlike -EAGAIN case, the failed folio is
1611 * removed from migration folio list and not
1612 * retried in the next outer loop.
1613 */
1614 nr_failed++;
1615 stats->nr_failed_pages += nr_pages;
1616 break;
1617 }
1618 }
1619 }
1620 /*
1621 * nr_failed is number of hugetlb folios failed to be migrated. After
1622 * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
1623 * folios as failed.
1624 */
1625 nr_failed += retry;
1626 stats->nr_failed_pages += nr_retry_pages;
1627
1628 return nr_failed;
1629 }
1630
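/*
 * Move the folios that were unmapped in the first stage. src_folios and
 * dst_folios are kept in the same order, so the two lists are walked in
 * lockstep.
 */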
1631 static void migrate_folios_move(struct list_head *src_folios,
1632 struct list_head *dst_folios,
1633 free_folio_t put_new_folio, unsigned long private,
1634 enum migrate_mode mode, int reason,
1635 struct list_head *ret_folios,
1636 struct migrate_pages_stats *stats,
1637 int *retry, int *thp_retry, int *nr_failed,
1638 int *nr_retry_pages)
1639 {
1640 struct folio *folio, *folio2, *dst, *dst2;
1641 bool is_thp;
1642 int nr_pages;
1643 int rc;
1644
1645 dst = list_first_entry(dst_folios, struct folio, lru);
1646 dst2 = list_next_entry(dst, lru);
1647 list_for_each_entry_safe(folio, folio2, src_folios, lru) {
1648 is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
1649 nr_pages = folio_nr_pages(folio);
1650
1651 cond_resched();
1652
1653 rc = migrate_folio_move(put_new_folio, private,
1654 folio, dst, mode,
1655 reason, ret_folios);
1656 /*
1657 * The rules are:
1658 * Success: folio will be freed
1659 * -EAGAIN: stay on the unmap_folios list
1660 * Other errno: put on ret_folios list
1661 */
1662 switch (rc) {
1663 case -EAGAIN:
1664 *retry += 1;
1665 *thp_retry += is_thp;
1666 *nr_retry_pages += nr_pages;
1667 break;
1668 case MIGRATEPAGE_SUCCESS:
1669 stats->nr_succeeded += nr_pages;
1670 stats->nr_thp_succeeded += is_thp;
1671 break;
1672 default:
1673 *nr_failed += 1;
1674 stats->nr_thp_failed += is_thp;
1675 stats->nr_failed_pages += nr_pages;
1676 break;
1677 }
1678 dst = dst2;
1679 dst2 = list_next_entry(dst, lru);
1680 }
1681 }
1682
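/*
 * Undo the unmap stage for every remaining src/dst pair: restore the
 * source folios and release the destination folios.
 */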
1683 static void migrate_folios_undo(struct list_head *src_folios,
1684 struct list_head *dst_folios,
1685 free_folio_t put_new_folio, unsigned long private,
1686 struct list_head *ret_folios)
1687 {
1688 struct folio *folio, *folio2, *dst, *dst2;
1689
1690 dst = list_first_entry(dst_folios, struct folio, lru);
1691 dst2 = list_next_entry(dst, lru);
1692 list_for_each_entry_safe(folio, folio2, src_folios, lru) {
1693 int old_page_state = 0;
1694 struct anon_vma *anon_vma = NULL;
1695
1696 __migrate_folio_extract(dst, &old_page_state, &anon_vma);
1697 migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
1698 anon_vma, true, ret_folios);
1699 list_del(&dst->lru);
1700 migrate_folio_undo_dst(dst, true, put_new_folio, private);
1701 dst = dst2;
1702 dst2 = list_next_entry(dst, lru);
1703 }
1704 }
1705
1706 /*
1707 * migrate_pages_batch() first unmaps folios in the from list as many as
1708 * possible, then move the unmapped folios.
1709 *
1710 * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
1711 * lock or bit while we have locked more than one folio, which may cause
1712 * deadlock (e.g., for the loop device). So, if mode != MIGRATE_ASYNC, the
1713 * length of the from list must be <= 1.
1714 */
1715 static int migrate_pages_batch(struct list_head *from,
1716 new_folio_t get_new_folio, free_folio_t put_new_folio,
1717 unsigned long private, enum migrate_mode mode, int reason,
1718 struct list_head *ret_folios, struct list_head *split_folios,
1719 struct migrate_pages_stats *stats, int nr_pass)
1720 {
1721 int retry = 1;
1722 int thp_retry = 1;
1723 int nr_failed = 0;
1724 int nr_retry_pages = 0;
1725 int pass = 0;
1726 bool is_thp = false;
1727 bool is_large = false;
1728 struct folio *folio, *folio2, *dst = NULL;
1729 int rc, rc_saved = 0, nr_pages;
1730 LIST_HEAD(unmap_folios);
1731 LIST_HEAD(dst_folios);
1732 bool nosplit = (reason == MR_NUMA_MISPLACED);
1733
1734 VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
1735 !list_empty(from) && !list_is_singular(from));
1736
1737 for (pass = 0; pass < nr_pass && retry; pass++) {
1738 retry = 0;
1739 thp_retry = 0;
1740 nr_retry_pages = 0;
1741
1742 list_for_each_entry_safe(folio, folio2, from, lru) {
1743 is_large = folio_test_large(folio);
1744 is_thp = folio_test_pmd_mappable(folio);
1745 nr_pages = folio_nr_pages(folio);
1746
1747 cond_resched();
1748
1749 /*
1750 * The rare folio on the deferred split list should
1751 * be split now. It should not count as a failure,
1752 * but we increment nr_failed because, without doing so,
1753 * migrate_pages() may report success with (split but
1754 * unmigrated) pages still on its fromlist; whereas it
1755 * always reports success when its fromlist is empty.
1756 * stats->nr_thp_failed should be increased too,
1757 * otherwise stats inconsistency will happen when
1758 * migrate_pages_batch is called via migrate_pages()
1759 * with MIGRATE_SYNC and MIGRATE_ASYNC.
1760 *
1761 * Only check it without removing it from the list,
1762 * since the folio can be on a deferred_split_scan()
1763 * local list and removing it can cause local list
1764 * corruption. The folio split process below can handle it
1765 * with the help of folio_ref_freeze().
1766 *
1767 * nr_pages > 2 is needed to avoid checking order-1
1768 * page cache folios. They exist, in contrast to
1769 * non-existent order-1 anonymous folios, and do not
1770 * use _deferred_list.
1771 */
1772 if (nr_pages > 2 &&
1773 !list_empty(&folio->_deferred_list) &&
1774 folio_test_partially_mapped(folio)) {
1775 if (!try_split_folio(folio, split_folios, mode)) {
1776 nr_failed++;
1777 stats->nr_thp_failed += is_thp;
1778 stats->nr_thp_split += is_thp;
1779 stats->nr_split++;
1780 continue;
1781 }
1782 }
1783
1784 /*
1785 * Large folio migration might be unsupported or
1786 * the allocation might fail, so we should retry
1787 * on the same folio with the large folio split
1788 * into normal folios.
1789 *
1790 * Split folios are put in split_folios, and
1791 * we will migrate them after the rest of the
1792 * list is processed.
1793 */
1794 if (!thp_migration_supported() && is_thp) {
1795 nr_failed++;
1796 stats->nr_thp_failed++;
1797 if (!try_split_folio(folio, split_folios, mode)) {
1798 stats->nr_thp_split++;
1799 stats->nr_split++;
1800 continue;
1801 }
1802 stats->nr_failed_pages += nr_pages;
1803 list_move_tail(&folio->lru, ret_folios);
1804 continue;
1805 }
1806
1807 rc = migrate_folio_unmap(get_new_folio, put_new_folio,
1808 private, folio, &dst, mode, reason,
1809 ret_folios);
1810 /*
1811 * The rules are:
1812 * Success: folio will be freed
1813 * Unmap: folio will be put on unmap_folios list,
1814 * dst folio put on dst_folios list
1815 * -EAGAIN: stay on the from list
1816 * -ENOMEM: stay on the from list
1817 * Other errno: put on ret_folios list
1818 */
1819 switch (rc) {
1820 case -ENOMEM:
1821 /*
1822 * When memory is low, don't bother trying to migrate
1823 * other folios; just move the already-unmapped folios, then exit.
1824 */
1825 nr_failed++;
1826 stats->nr_thp_failed += is_thp;
1827 /* Large folio NUMA faulting doesn't split to retry. */
1828 if (is_large && !nosplit) {
1829 int ret = try_split_folio(folio, split_folios, mode);
1830
1831 if (!ret) {
1832 stats->nr_thp_split += is_thp;
1833 stats->nr_split++;
1834 break;
1835 } else if (reason == MR_LONGTERM_PIN &&
1836 ret == -EAGAIN) {
1837 /*
1838 * Try again to split large folio to
1839 * mitigate the failure of longterm pinning.
1840 */
1841 retry++;
1842 thp_retry += is_thp;
1843 nr_retry_pages += nr_pages;
1844 /* Undo duplicated failure counting. */
1845 nr_failed--;
1846 stats->nr_thp_failed -= is_thp;
1847 break;
1848 }
1849 }
1850
1851 stats->nr_failed_pages += nr_pages + nr_retry_pages;
1852 /* nr_failed isn't updated because it is not used after this point */
1853 stats->nr_thp_failed += thp_retry;
1854 rc_saved = rc;
1855 if (list_empty(&unmap_folios))
1856 goto out;
1857 else
1858 goto move;
1859 case -EAGAIN:
1860 retry++;
1861 thp_retry += is_thp;
1862 nr_retry_pages += nr_pages;
1863 break;
1864 case MIGRATEPAGE_SUCCESS:
1865 stats->nr_succeeded += nr_pages;
1866 stats->nr_thp_succeeded += is_thp;
1867 break;
1868 case MIGRATEPAGE_UNMAP:
1869 list_move_tail(&folio->lru, &unmap_folios);
1870 list_add_tail(&dst->lru, &dst_folios);
1871 break;
1872 default:
1873 /*
1874 * Permanent failure (-EBUSY, etc.):
1875 * unlike -EAGAIN case, the failed folio is
1876 * removed from migration folio list and not
1877 * retried in the next outer loop.
1878 */
1879 nr_failed++;
1880 stats->nr_thp_failed += is_thp;
1881 stats->nr_failed_pages += nr_pages;
1882 break;
1883 }
1884 }
1885 }
1886 nr_failed += retry;
1887 stats->nr_thp_failed += thp_retry;
1888 stats->nr_failed_pages += nr_retry_pages;
1889 move:
1890 /* Flush TLBs for all unmapped folios */
1891 try_to_unmap_flush();
1892
1893 retry = 1;
1894 for (pass = 0; pass < nr_pass && retry; pass++) {
1895 retry = 0;
1896 thp_retry = 0;
1897 nr_retry_pages = 0;
1898
1899 /* Move the unmapped folios */
1900 migrate_folios_move(&unmap_folios, &dst_folios,
1901 put_new_folio, private, mode, reason,
1902 ret_folios, stats, &retry, &thp_retry,
1903 &nr_failed, &nr_retry_pages);
1904 }
1905 nr_failed += retry;
1906 stats->nr_thp_failed += thp_retry;
1907 stats->nr_failed_pages += nr_retry_pages;
1908
1909 rc = rc_saved ? : nr_failed;
1910 out:
1911 /* Cleanup remaining folios */
1912 migrate_folios_undo(&unmap_folios, &dst_folios,
1913 put_new_folio, private, ret_folios);
1914
1915 return rc;
1916 }
1917
1918 static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
1919 free_folio_t put_new_folio, unsigned long private,
1920 enum migrate_mode mode, int reason,
1921 struct list_head *ret_folios, struct list_head *split_folios,
1922 struct migrate_pages_stats *stats)
1923 {
1924 int rc, nr_failed = 0;
1925 LIST_HEAD(folios);
1926 struct migrate_pages_stats astats;
1927
1928 memset(&astats, 0, sizeof(astats));
1929 /* Try to migrate in batch with MIGRATE_ASYNC mode first */
1930 rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
1931 reason, &folios, split_folios, &astats,
1932 NR_MAX_MIGRATE_ASYNC_RETRY);
1933 stats->nr_succeeded += astats.nr_succeeded;
1934 stats->nr_thp_succeeded += astats.nr_thp_succeeded;
1935 stats->nr_thp_split += astats.nr_thp_split;
1936 stats->nr_split += astats.nr_split;
1937 if (rc < 0) {
1938 stats->nr_failed_pages += astats.nr_failed_pages;
1939 stats->nr_thp_failed += astats.nr_thp_failed;
1940 list_splice_tail(&folios, ret_folios);
1941 return rc;
1942 }
1943 stats->nr_thp_failed += astats.nr_thp_split;
1944 /*
1945 * Do not count rc, as pages will be retried below.
1946 * Count nr_split only, since it includes nr_thp_split.
1947 */
1948 nr_failed += astats.nr_split;
1949 /*
1950 * Fall back to migrating all failed folios one by one synchronously. All
1951 * failed folios except split THPs will be retried, so their failure
1952 * isn't counted.
1953 */
1954 list_splice_tail_init(&folios, from);
1955 while (!list_empty(from)) {
1956 list_move(from->next, &folios);
1957 rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
1958 private, mode, reason, ret_folios,
1959 split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
1960 list_splice_tail_init(&folios, ret_folios);
1961 if (rc < 0)
1962 return rc;
1963 nr_failed += rc;
1964 }
1965
1966 return nr_failed;
1967 }
1968
1969 /*
1970 * migrate_pages - migrate the folios specified in a list, to the free folios
1971 * supplied as the target for the page migration
1972 *
1973 * @from: The list of folios to be migrated.
1974 * @get_new_folio: The function used to allocate free folios to be used
1975 * as the target of the folio migration.
1976 * @put_new_folio: The function used to free target folios if migration
1977 * fails, or NULL if no special handling is necessary.
1978 * @private: Private data to be passed on to get_new_folio()
1979 * @mode: The migration mode that specifies the constraints for
1980 * folio migration, if any.
1981 * @reason: The reason for folio migration.
1982 * @ret_succeeded: Set to the number of folios migrated successfully if
1983 * the caller passes a non-NULL pointer.
1984 *
1985 * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
1986 * are movable any more because the list has become empty or no retryable folios
1987 * exist any more. It is the caller's responsibility to call putback_movable_pages()
1988 * only if ret != 0.
1989 *
1990 * Returns the number of {normal, large, hugetlb} folios that were not
1991 * migrated, or an error code. The number of large folio splits will be
1992 * considered as the number of non-migrated large folios, no matter how many
1993 * split folios of the large folio are migrated successfully.
1994 */
1995 int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
1996 free_folio_t put_new_folio, unsigned long private,
1997 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1998 {
1999 int rc, rc_gather;
2000 int nr_pages;
2001 struct folio *folio, *folio2;
2002 LIST_HEAD(folios);
2003 LIST_HEAD(ret_folios);
2004 LIST_HEAD(split_folios);
2005 struct migrate_pages_stats stats;
2006
2007 trace_mm_migrate_pages_start(mode, reason);
2008
2009 memset(&stats, 0, sizeof(stats));
2010
2011 rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
2012 mode, reason, &stats, &ret_folios);
2013 if (rc_gather < 0)
2014 goto out;
2015
2016 again:
2017 nr_pages = 0;
2018 list_for_each_entry_safe(folio, folio2, from, lru) {
2019 /* Retried hugetlb folios will be kept in list */
2020 if (folio_test_hugetlb(folio)) {
2021 list_move_tail(&folio->lru, &ret_folios);
2022 continue;
2023 }
2024
2025 nr_pages += folio_nr_pages(folio);
2026 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
2027 break;
2028 }
2029 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
2030 list_cut_before(&folios, from, &folio2->lru);
2031 else
2032 list_splice_init(from, &folios);
2033 if (mode == MIGRATE_ASYNC)
2034 rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
2035 private, mode, reason, &ret_folios,
2036 &split_folios, &stats,
2037 NR_MAX_MIGRATE_PAGES_RETRY);
2038 else
2039 rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
2040 private, mode, reason, &ret_folios,
2041 &split_folios, &stats);
2042 list_splice_tail_init(&folios, &ret_folios);
2043 if (rc < 0) {
2044 rc_gather = rc;
2045 list_splice_tail(&split_folios, &ret_folios);
2046 goto out;
2047 }
2048 if (!list_empty(&split_folios)) {
2049 /*
2050 * Failure isn't counted since all split folios of a large folio
2051 * are counted as 1 failure already. And, we only try to migrate
2052 * with minimal effort, forcing MIGRATE_ASYNC mode and retrying once.
2053 */
2054 migrate_pages_batch(&split_folios, get_new_folio,
2055 put_new_folio, private, MIGRATE_ASYNC, reason,
2056 &ret_folios, NULL, &stats, 1);
2057 list_splice_tail_init(&split_folios, &ret_folios);
2058 }
2059 rc_gather += rc;
2060 if (!list_empty(from))
2061 goto again;
2062 out:
2063 /*
2064 * Put the permanently failed folios back on the migration list; they
2065 * will be put back on the right list by the caller.
2066 */
2067 list_splice(&ret_folios, from);
2068
2069 /*
2070 * Return 0 in case all split folios of fail-to-migrate large folios
2071 * are migrated successfully.
2072 */
2073 if (list_empty(from))
2074 rc_gather = 0;
2075
2076 count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
2077 count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
2078 count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
2079 count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
2080 count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
2081 trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
2082 stats.nr_thp_succeeded, stats.nr_thp_failed,
2083 stats.nr_thp_split, stats.nr_split, mode,
2084 reason);
2085
2086 if (ret_succeeded)
2087 *ret_succeeded = stats.nr_succeeded;
2088
2089 return rc_gather;
2090 }
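
/*
 * Editorial note: a minimal, illustrative sketch of how a caller typically
 * drives migrate_pages(), mirroring the pattern used by
 * do_move_pages_to_node() further down in this file. The helper name
 * move_list_to_node() is an assumption for the example only; this is not
 * additional kernel code and is kept under #if 0.
 */
#if 0
static int move_list_to_node(struct list_head *pagelist, int node)
{
	struct migration_target_control mtc = {
		.nid = node,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};
	int err;

	/* migrate_pages() frees the folios it migrates successfully ... */
	err = migrate_pages(pagelist, alloc_migration_target, NULL,
			    (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
	/*
	 * ... but, per the comment above migrate_pages(), the caller must put
	 * back whatever remains on the list when the return value is non-zero.
	 */
	if (err)
		putback_movable_pages(pagelist);
	return err;
}
#endif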
2091
2092 struct folio *alloc_migration_target(struct folio *src, unsigned long private)
2093 {
2094 struct migration_target_control *mtc;
2095 gfp_t gfp_mask;
2096 unsigned int order = 0;
2097 int nid;
2098 int zidx;
2099
2100 mtc = (struct migration_target_control *)private;
2101 gfp_mask = mtc->gfp_mask;
2102 nid = mtc->nid;
2103 if (nid == NUMA_NO_NODE)
2104 nid = folio_nid(src);
2105
2106 if (folio_test_hugetlb(src)) {
2107 struct hstate *h = folio_hstate(src);
2108
2109 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
2110 return alloc_hugetlb_folio_nodemask(h, nid,
2111 mtc->nmask, gfp_mask,
2112 htlb_allow_alloc_fallback(mtc->reason));
2113 }
2114
2115 if (folio_test_large(src)) {
2116 /*
2117 * Clear __GFP_RECLAIM to make the migration callback
2118 * consistent with regular THP allocations.
2119 */
2120 gfp_mask &= ~__GFP_RECLAIM;
2121 gfp_mask |= GFP_TRANSHUGE;
2122 order = folio_order(src);
2123 }
2124 zidx = zone_idx(folio_zone(src));
2125 if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
2126 gfp_mask |= __GFP_HIGHMEM;
2127
2128 return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
2129 }
2130
2131 #ifdef CONFIG_NUMA
2132
2133 static int store_status(int __user *status, int start, int value, int nr)
2134 {
2135 while (nr-- > 0) {
2136 if (put_user(value, status + start))
2137 return -EFAULT;
2138 start++;
2139 }
2140
2141 return 0;
2142 }
2143
2144 static int do_move_pages_to_node(struct list_head *pagelist, int node)
2145 {
2146 int err;
2147 struct migration_target_control mtc = {
2148 .nid = node,
2149 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
2150 .reason = MR_SYSCALL,
2151 };
2152
2153 err = migrate_pages(pagelist, alloc_migration_target, NULL,
2154 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
2155 if (err)
2156 putback_movable_pages(pagelist);
2157 return err;
2158 }
2159
2160 static int __add_folio_for_migration(struct folio *folio, int node,
2161 struct list_head *pagelist, bool migrate_all)
2162 {
2163 if (is_zero_folio(folio) || is_huge_zero_folio(folio))
2164 return -EFAULT;
2165
2166 if (folio_is_zone_device(folio))
2167 return -ENOENT;
2168
2169 if (folio_nid(folio) == node)
2170 return 0;
2171
2172 if (folio_maybe_mapped_shared(folio) && !migrate_all)
2173 return -EACCES;
2174
2175 if (folio_test_hugetlb(folio)) {
2176 if (folio_isolate_hugetlb(folio, pagelist))
2177 return 1;
2178 } else if (folio_isolate_lru(folio)) {
2179 list_add_tail(&folio->lru, pagelist);
2180 node_stat_mod_folio(folio,
2181 NR_ISOLATED_ANON + folio_is_file_lru(folio),
2182 folio_nr_pages(folio));
2183 return 1;
2184 }
2185 return -EBUSY;
2186 }
2187
2188 /*
2189 * Resolves the given address to a struct folio, isolates it from the LRU and
2190 * puts it to the given pagelist.
2191 * Returns:
2192 * errno - if the folio cannot be found/isolated
2193 * 0 - when it doesn't have to be migrated because it is already on the
2194 * target node
2195 * 1 - when it has been queued
2196 */
2197 static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
2198 int node, struct list_head *pagelist, bool migrate_all)
2199 {
2200 struct vm_area_struct *vma;
2201 struct folio_walk fw;
2202 struct folio *folio;
2203 unsigned long addr;
2204 int err = -EFAULT;
2205
2206 mmap_read_lock(mm);
2207 addr = (unsigned long)untagged_addr_remote(mm, p);
2208
2209 vma = vma_lookup(mm, addr);
2210 if (vma && vma_migratable(vma)) {
2211 folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
2212 if (folio) {
2213 err = __add_folio_for_migration(folio, node, pagelist,
2214 migrate_all);
2215 folio_walk_end(&fw, vma);
2216 } else {
2217 err = -ENOENT;
2218 }
2219 }
2220 mmap_read_unlock(mm);
2221 return err;
2222 }
2223
2224 static int move_pages_and_store_status(int node,
2225 struct list_head *pagelist, int __user *status,
2226 int start, int i, unsigned long nr_pages)
2227 {
2228 int err;
2229
2230 if (list_empty(pagelist))
2231 return 0;
2232
2233 err = do_move_pages_to_node(pagelist, node);
2234 if (err) {
2235 /*
2236 * A positive err means the number of pages
2237 * that failed to migrate. Since we are going
2238 * to abort and return the number of non-migrated
2239 * pages, we need to include the rest of the
2240 * nr_pages that have not been attempted as
2241 * well.
2242 */
2243 if (err > 0)
2244 err += nr_pages - i;
2245 return err;
2246 }
2247 return store_status(status, start, node, i - start);
2248 }
2249
2250 /*
2251 * Migrate an array of page addresses onto an array of nodes and fill
2252 * the corresponding array of status values.
2253 */
2254 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
2255 unsigned long nr_pages,
2256 const void __user * __user *pages,
2257 const int __user *nodes,
2258 int __user *status, int flags)
2259 {
2260 compat_uptr_t __user *compat_pages = (void __user *)pages;
2261 int current_node = NUMA_NO_NODE;
2262 LIST_HEAD(pagelist);
2263 int start, i;
2264 int err = 0, err1;
2265
2266 lru_cache_disable();
2267
2268 for (i = start = 0; i < nr_pages; i++) {
2269 const void __user *p;
2270 int node;
2271
2272 err = -EFAULT;
2273 if (in_compat_syscall()) {
2274 compat_uptr_t cp;
2275
2276 if (get_user(cp, compat_pages + i))
2277 goto out_flush;
2278
2279 p = compat_ptr(cp);
2280 } else {
2281 if (get_user(p, pages + i))
2282 goto out_flush;
2283 }
2284 if (get_user(node, nodes + i))
2285 goto out_flush;
2286
2287 err = -ENODEV;
2288 if (node < 0 || node >= MAX_NUMNODES)
2289 goto out_flush;
2290 if (!node_state(node, N_MEMORY))
2291 goto out_flush;
2292
2293 err = -EACCES;
2294 if (!node_isset(node, task_nodes))
2295 goto out_flush;
2296
2297 if (current_node == NUMA_NO_NODE) {
2298 current_node = node;
2299 start = i;
2300 } else if (node != current_node) {
2301 err = move_pages_and_store_status(current_node,
2302 &pagelist, status, start, i, nr_pages);
2303 if (err)
2304 goto out;
2305 start = i;
2306 current_node = node;
2307 }
2308
2309 /*
2310 * Errors in the page lookup or isolation are not fatal and we simply
2311 * report them via status.
2312 */
2313 err = add_folio_for_migration(mm, p, current_node, &pagelist,
2314 flags & MPOL_MF_MOVE_ALL);
2315
2316 if (err > 0) {
2317 /* The page is successfully queued for migration */
2318 continue;
2319 }
2320
2321 /*
2322 * The move_pages() man page does not have an -EEXIST choice, so
2323 * use -EFAULT instead.
2324 */
2325 if (err == -EEXIST)
2326 err = -EFAULT;
2327
2328 /*
2329 * If the page is already on the target node (!err), store the
2330 * node, otherwise, store the err.
2331 */
2332 err = store_status(status, i, err ? : current_node, 1);
2333 if (err)
2334 goto out_flush;
2335
2336 err = move_pages_and_store_status(current_node, &pagelist,
2337 status, start, i, nr_pages);
2338 if (err) {
2339 /* We have accounted for page i */
2340 if (err > 0)
2341 err--;
2342 goto out;
2343 }
2344 current_node = NUMA_NO_NODE;
2345 }
2346 out_flush:
2347 /* Make sure we do not overwrite the existing error */
2348 err1 = move_pages_and_store_status(current_node, &pagelist,
2349 status, start, i, nr_pages);
2350 if (err >= 0)
2351 err = err1;
2352 out:
2353 lru_cache_enable();
2354 return err;
2355 }
2356
2357 /*
2358 * Determine the nodes of an array of pages and store them in an array of status values.
2359 */
2360 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
2361 const void __user **pages, int *status)
2362 {
2363 unsigned long i;
2364
2365 mmap_read_lock(mm);
2366
2367 for (i = 0; i < nr_pages; i++) {
2368 unsigned long addr = (unsigned long)(*pages);
2369 struct vm_area_struct *vma;
2370 struct folio_walk fw;
2371 struct folio *folio;
2372 int err = -EFAULT;
2373
2374 vma = vma_lookup(mm, addr);
2375 if (!vma)
2376 goto set_status;
2377
2378 folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
2379 if (folio) {
2380 if (is_zero_folio(folio) || is_huge_zero_folio(folio))
2381 err = -EFAULT;
2382 else if (folio_is_zone_device(folio))
2383 err = -ENOENT;
2384 else
2385 err = folio_nid(folio);
2386 folio_walk_end(&fw, vma);
2387 } else {
2388 err = -ENOENT;
2389 }
2390 set_status:
2391 *status = err;
2392
2393 pages++;
2394 status++;
2395 }
2396
2397 mmap_read_unlock(mm);
2398 }
2399
2400 static int get_compat_pages_array(const void __user *chunk_pages[],
2401 const void __user * __user *pages,
2402 unsigned long chunk_nr)
2403 {
2404 compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
2405 compat_uptr_t p;
2406 int i;
2407
2408 for (i = 0; i < chunk_nr; i++) {
2409 if (get_user(p, pages32 + i))
2410 return -EFAULT;
2411 chunk_pages[i] = compat_ptr(p);
2412 }
2413
2414 return 0;
2415 }
2416
2417 /*
2418 * Determine the nodes of a user array of pages and store them in
2419 * a user array of status values.
2420 */
2421 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
2422 const void __user * __user *pages,
2423 int __user *status)
2424 {
2425 #define DO_PAGES_STAT_CHUNK_NR 16UL
2426 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
2427 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
2428
2429 while (nr_pages) {
2430 unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
2431
2432 if (in_compat_syscall()) {
2433 if (get_compat_pages_array(chunk_pages, pages,
2434 chunk_nr))
2435 break;
2436 } else {
2437 if (copy_from_user(chunk_pages, pages,
2438 chunk_nr * sizeof(*chunk_pages)))
2439 break;
2440 }
2441
2442 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
2443
2444 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
2445 break;
2446
2447 pages += chunk_nr;
2448 status += chunk_nr;
2449 nr_pages -= chunk_nr;
2450 }
2451 return nr_pages ? -EFAULT : 0;
2452 }
2453
2454 static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
2455 {
2456 struct task_struct *task;
2457 struct mm_struct *mm;
2458
2459 /*
2460 * There is no need to check if the current process has the right to modify
2461 * the specified process when they are the same.
2462 */
2463 if (!pid) {
2464 mmget(current->mm);
2465 *mem_nodes = cpuset_mems_allowed(current);
2466 return current->mm;
2467 }
2468
2469 task = find_get_task_by_vpid(pid);
2470 if (!task)
2471 return ERR_PTR(-ESRCH);
2473
2474 /*
2475 * Check if this process has the right to modify the specified
2476 * process. Use the regular "ptrace_may_access()" checks.
2477 */
2478 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
2479 mm = ERR_PTR(-EPERM);
2480 goto out;
2481 }
2482
2483 mm = ERR_PTR(security_task_movememory(task));
2484 if (IS_ERR(mm))
2485 goto out;
2486 *mem_nodes = cpuset_mems_allowed(task);
2487 mm = get_task_mm(task);
2488 out:
2489 put_task_struct(task);
2490 if (!mm)
2491 mm = ERR_PTR(-EINVAL);
2492 return mm;
2493 }
2494
2495 /*
2496 * Move a list of pages in the address space of the currently executing
2497 * process.
2498 */
2499 static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
2500 const void __user * __user *pages,
2501 const int __user *nodes,
2502 int __user *status, int flags)
2503 {
2504 struct mm_struct *mm;
2505 int err;
2506 nodemask_t task_nodes;
2507
2508 /* Check flags */
2509 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2510 return -EINVAL;
2511
2512 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2513 return -EPERM;
2514
2515 mm = find_mm_struct(pid, &task_nodes);
2516 if (IS_ERR(mm))
2517 return PTR_ERR(mm);
2518
2519 if (nodes)
2520 err = do_pages_move(mm, task_nodes, nr_pages, pages,
2521 nodes, status, flags);
2522 else
2523 err = do_pages_stat(mm, nr_pages, pages, status);
2524
2525 mmput(mm);
2526 return err;
2527 }
2528
2529 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
2530 const void __user * __user *, pages,
2531 const int __user *, nodes,
2532 int __user *, status, int, flags)
2533 {
2534 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
2535 }
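
/*
 * Editorial note: a hedged userspace sketch of invoking the move_pages(2)
 * syscall defined above, via the numaif.h/libnuma wrapper. The wrapper
 * prototype and MPOL_MF_MOVE belong to the userspace ABI, not this file;
 * passing nodes == NULL only queries the current node of each page (the
 * do_pages_stat() path). Example only, kept under #if 0 so it is never
 * built as part of the kernel.
 */
#if 0
#include <numaif.h>		/* move_pages() wrapper */
#include <stdio.h>

static void show_and_move(void *addr, int target_node)
{
	void *pages[1] = { addr };
	int nodes[1] = { target_node };
	int status[1] = { -1 };

	/* pid 0 means the calling process; see find_mm_struct() above. */
	if (move_pages(0, 1, pages, NULL, status, 0) == 0)
		printf("page currently on node %d\n", status[0]);

	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
		printf("after move: node (or -errno) %d\n", status[0]);
}
#endif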
2536
2537 #ifdef CONFIG_NUMA_BALANCING
2538 /*
2539 * Returns true if this is a safe migration target node for misplaced NUMA
2540 * pages. Currently it only checks the watermarks, which is crude.
2541 */
2542 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
2543 unsigned long nr_migrate_pages)
2544 {
2545 int z;
2546
2547 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2548 struct zone *zone = pgdat->node_zones + z;
2549
2550 if (!managed_zone(zone))
2551 continue;
2552
2553 /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
2554 if (!zone_watermark_ok(zone, 0,
2555 high_wmark_pages(zone) +
2556 nr_migrate_pages,
2557 ZONE_MOVABLE, ALLOC_CMA))
2558 continue;
2559 return true;
2560 }
2561 return false;
2562 }
2563
2564 static struct folio *alloc_misplaced_dst_folio(struct folio *src,
2565 unsigned long data)
2566 {
2567 int nid = (int) data;
2568 int order = folio_order(src);
2569 gfp_t gfp = __GFP_THISNODE;
2570
2571 if (order > 0)
2572 gfp |= GFP_TRANSHUGE_LIGHT;
2573 else {
2574 gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2575 __GFP_NOWARN;
2576 gfp &= ~__GFP_RECLAIM;
2577 }
2578 return __folio_alloc_node(gfp, order, nid);
2579 }
2580
2581 /*
2582 * Prepare for calling migrate_misplaced_folio() by isolating the folio if
2583 * permitted. Must be called with the PTL still held.
2584 */
2585 int migrate_misplaced_folio_prepare(struct folio *folio,
2586 struct vm_area_struct *vma, int node)
2587 {
2588 int nr_pages = folio_nr_pages(folio);
2589 pg_data_t *pgdat = NODE_DATA(node);
2590
2591 if (folio_is_file_lru(folio)) {
2592 /*
2593 * Do not migrate file folios that are mapped in multiple
2594 * processes with execute permissions as they are probably
2595 * shared libraries.
2596 *
2597 * See folio_maybe_mapped_shared() on possible imprecision
2598 * when we cannot easily detect if a folio is shared.
2599 */
2600 if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
2601 return -EACCES;
2602
2603 /*
2604 * Do not migrate dirty folios as not all filesystems can move
2605 * dirty folios in MIGRATE_ASYNC mode, which is a waste of
2606 * cycles.
2607 */
2608 if (folio_test_dirty(folio))
2609 return -EAGAIN;
2610 }
2611
2612 /* Avoid migrating to a node that is nearly full */
2613 if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2614 int z;
2615
2616 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2617 return -EAGAIN;
2618 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2619 if (managed_zone(pgdat->node_zones + z))
2620 break;
2621 }
2622
2623 /*
2624 * If there are no managed zones, do not proceed
2625 * further.
2626 */
2627 if (z < 0)
2628 return -EAGAIN;
2629
2630 wakeup_kswapd(pgdat->node_zones + z, 0,
2631 folio_order(folio), ZONE_MOVABLE);
2632 return -EAGAIN;
2633 }
2634
2635 if (!folio_isolate_lru(folio))
2636 return -EAGAIN;
2637
2638 node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
2639 nr_pages);
2640 return 0;
2641 }
2642
2643 /*
2644 * Attempt to migrate a misplaced folio to the specified destination
2645 * node. Caller is expected to have isolated the folio by calling
2646 * migrate_misplaced_folio_prepare(), which will result in an
2647 * elevated reference count on the folio. This function will un-isolate the
2648 * folio and drop that reference before returning.
2649 */
2650 int migrate_misplaced_folio(struct folio *folio, int node)
2651 {
2652 pg_data_t *pgdat = NODE_DATA(node);
2653 int nr_remaining;
2654 unsigned int nr_succeeded;
2655 LIST_HEAD(migratepages);
2656 struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
2657 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
2658
2659 list_add(&folio->lru, &migratepages);
2660 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
2661 NULL, node, MIGRATE_ASYNC,
2662 MR_NUMA_MISPLACED, &nr_succeeded);
2663 if (nr_remaining && !list_empty(&migratepages))
2664 putback_movable_pages(&migratepages);
2665 if (nr_succeeded) {
2666 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2667 count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
2668 if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
2669 && !node_is_toptier(folio_nid(folio))
2670 && node_is_toptier(node))
2671 mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
2672 }
2673 mem_cgroup_put(memcg);
2674 BUG_ON(!list_empty(&migratepages));
2675 return nr_remaining ? -EAGAIN : 0;
2676 }
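
/*
 * Editorial note: a simplified, illustrative outline of the calling pattern
 * for the two helpers above, as used from a NUMA hinting fault handler
 * (see do_numa_page() in mm/memory.c for the real thing). The ptep/ptl
 * parameters stand in for whatever page-table context the real caller holds;
 * this sketch is an assumption about a typical caller, not kernel code, and
 * is kept under #if 0.
 */
#if 0
static int numa_hint_fault_migrate(struct folio *folio,
		struct vm_area_struct *vma, int target_nid,
		pte_t *ptep, spinlock_t *ptl)
{
	/* Isolation must happen while the PTL is still held. */
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid))
		return -EAGAIN;		/* could not isolate; leave folio in place */

	/* Drop the PTL before the potentially sleeping migration below. */
	pte_unmap_unlock(ptep, ptl);

	/* Consumes the isolation reference taken by the prepare step. */
	return migrate_misplaced_folio(folio, target_nid);
}
#endif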
2677 #endif /* CONFIG_NUMA_BALANCING */
2678 #endif /* CONFIG_NUMA */
2679