// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints about the node(s) from which
 * memory should be allocated.
 *
 * Six policies are supported, per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails. Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When the process
 * policy is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
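
/*
 * Illustrative sketch of how the policies above are selected from userspace
 * (not kernel code; uses the set_mempolicy(2)/mbind(2) wrappers from
 * libnuma's <numaif.h>; node numbers and mask sizes are examples only):
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node1 = 1UL << 1;
 *
 *	// Interleave this task's future allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *	// Restrict one mapping to node 1, migrating misplaced pages.
 *	mbind(addr, len, MPOL_BIND, &node1, 8 * sizeof(node1),
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 *
 *	// Return to the default policy (local allocation).
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */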

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
 * that the system-default value should be used. A NULL iw_table also denotes
 * that system-default values should be used. Until the system-default table
 * is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected
 */
static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
	u8 *table;
	u8 weight;

	rcu_read_lock();
	table = rcu_dereference(iw_table);
	/* if no iw_table, use system default */
	weight = table ? table[node] : 1;
	/* if value in iw_table is 0, use system default */
	weight = weight ? weight : 1;
	rcu_read_unlock();
	return weight;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
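 *
 * Illustrative use (names are examples, not callers in this file): a driver
 * looking for the closest node that actually has memory might do
 *
 *	int nid = numa_nearest_node(dev_to_node(dev), N_MEMORY);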
174 * 175 * Return: this @node if it is in state, otherwise the closest node by distance 176 */ 177 int numa_nearest_node(int node, unsigned int state) 178 { 179 int min_dist = INT_MAX, dist, n, min_node; 180 181 if (state >= NR_NODE_STATES) 182 return -EINVAL; 183 184 if (node == NUMA_NO_NODE || node_state(node, state)) 185 return node; 186 187 min_node = node; 188 for_each_node_state(n, state) { 189 dist = node_distance(node, n); 190 if (dist < min_dist) { 191 min_dist = dist; 192 min_node = n; 193 } 194 } 195 196 return min_node; 197 } 198 EXPORT_SYMBOL_GPL(numa_nearest_node); 199 200 /** 201 * nearest_node_nodemask - Find the node in @mask at the nearest distance 202 * from @node. 203 * 204 * @node: a valid node ID to start the search from. 205 * @mask: a pointer to a nodemask representing the allowed nodes. 206 * 207 * This function iterates over all nodes in @mask and calculates the 208 * distance from the starting @node, then it returns the node ID that is 209 * the closest to @node, or MAX_NUMNODES if no node is found. 210 * 211 * Note that @node must be a valid node ID usable with node_distance(), 212 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 213 * or unexpected behavior. 214 */ 215 int nearest_node_nodemask(int node, nodemask_t *mask) 216 { 217 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 218 219 for_each_node_mask(n, *mask) { 220 dist = node_distance(node, n); 221 if (dist < min_dist) { 222 min_dist = dist; 223 min_node = n; 224 } 225 } 226 227 return min_node; 228 } 229 EXPORT_SYMBOL_GPL(nearest_node_nodemask); 230 231 struct mempolicy *get_task_policy(struct task_struct *p) 232 { 233 struct mempolicy *pol = p->mempolicy; 234 int node; 235 236 if (pol) 237 return pol; 238 239 node = numa_node_id(); 240 if (node != NUMA_NO_NODE) { 241 pol = &preferred_node_policy[node]; 242 /* preferred_node_policy is not initialised early in boot */ 243 if (pol->mode) 244 return pol; 245 } 246 247 return &default_policy; 248 } 249 250 static const struct mempolicy_operations { 251 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 252 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 253 } mpol_ops[MPOL_MAX]; 254 255 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 256 { 257 return pol->flags & MPOL_MODE_FLAGS; 258 } 259 260 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 261 const nodemask_t *rel) 262 { 263 nodemask_t tmp; 264 nodes_fold(tmp, *orig, nodes_weight(*rel)); 265 nodes_onto(*ret, tmp, *rel); 266 } 267 268 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 269 { 270 if (nodes_empty(*nodes)) 271 return -EINVAL; 272 pol->nodes = *nodes; 273 return 0; 274 } 275 276 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 277 { 278 if (nodes_empty(*nodes)) 279 return -EINVAL; 280 281 nodes_clear(pol->nodes); 282 node_set(first_node(*nodes), pol->nodes); 283 return 0; 284 } 285 286 /* 287 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 288 * any, for the new policy. mpol_new() has already validated the nodes 289 * parameter with respect to the policy mode and flags. 290 * 291 * Must be called holding task's alloc_lock to protect task's mems_allowed 292 * and mempolicy. May also be called holding the mmap_lock for write. 
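 *
 * As a worked example of the remapping done here: with MPOL_F_RELATIVE_NODES,
 * a user nodemask of {0,1} evaluated while cpuset_current_mems_allowed is
 * {4,5,6,7} is folded onto the allowed set and becomes {4,5}; with
 * MPOL_F_STATIC_NODES (or no mode flag) the user nodemask is instead
 * intersected with the allowed set directly.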
293 */ 294 static int mpol_set_nodemask(struct mempolicy *pol, 295 const nodemask_t *nodes, struct nodemask_scratch *nsc) 296 { 297 int ret; 298 299 /* 300 * Default (pol==NULL) resp. local memory policies are not a 301 * subject of any remapping. They also do not need any special 302 * constructor. 303 */ 304 if (!pol || pol->mode == MPOL_LOCAL) 305 return 0; 306 307 /* Check N_MEMORY */ 308 nodes_and(nsc->mask1, 309 cpuset_current_mems_allowed, node_states[N_MEMORY]); 310 311 VM_BUG_ON(!nodes); 312 313 if (pol->flags & MPOL_F_RELATIVE_NODES) 314 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 315 else 316 nodes_and(nsc->mask2, *nodes, nsc->mask1); 317 318 if (mpol_store_user_nodemask(pol)) 319 pol->w.user_nodemask = *nodes; 320 else 321 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 322 323 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 324 return ret; 325 } 326 327 /* 328 * This function just creates a new policy, does some check and simple 329 * initialization. You must invoke mpol_set_nodemask() to set nodes. 330 */ 331 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 332 nodemask_t *nodes) 333 { 334 struct mempolicy *policy; 335 336 if (mode == MPOL_DEFAULT) { 337 if (nodes && !nodes_empty(*nodes)) 338 return ERR_PTR(-EINVAL); 339 return NULL; 340 } 341 VM_BUG_ON(!nodes); 342 343 /* 344 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 345 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 346 * All other modes require a valid pointer to a non-empty nodemask. 347 */ 348 if (mode == MPOL_PREFERRED) { 349 if (nodes_empty(*nodes)) { 350 if (((flags & MPOL_F_STATIC_NODES) || 351 (flags & MPOL_F_RELATIVE_NODES))) 352 return ERR_PTR(-EINVAL); 353 354 mode = MPOL_LOCAL; 355 } 356 } else if (mode == MPOL_LOCAL) { 357 if (!nodes_empty(*nodes) || 358 (flags & MPOL_F_STATIC_NODES) || 359 (flags & MPOL_F_RELATIVE_NODES)) 360 return ERR_PTR(-EINVAL); 361 } else if (nodes_empty(*nodes)) 362 return ERR_PTR(-EINVAL); 363 364 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 365 if (!policy) 366 return ERR_PTR(-ENOMEM); 367 atomic_set(&policy->refcnt, 1); 368 policy->mode = mode; 369 policy->flags = flags; 370 policy->home_node = NUMA_NO_NODE; 371 372 return policy; 373 } 374 375 /* Slow path of a mpol destructor. */ 376 void __mpol_put(struct mempolicy *pol) 377 { 378 if (!atomic_dec_and_test(&pol->refcnt)) 379 return; 380 kmem_cache_free(policy_cache, pol); 381 } 382 383 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 384 { 385 } 386 387 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 388 { 389 nodemask_t tmp; 390 391 if (pol->flags & MPOL_F_STATIC_NODES) 392 nodes_and(tmp, pol->w.user_nodemask, *nodes); 393 else if (pol->flags & MPOL_F_RELATIVE_NODES) 394 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 395 else { 396 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 397 *nodes); 398 pol->w.cpuset_mems_allowed = *nodes; 399 } 400 401 if (nodes_empty(tmp)) 402 tmp = *nodes; 403 404 pol->nodes = tmp; 405 } 406 407 static void mpol_rebind_preferred(struct mempolicy *pol, 408 const nodemask_t *nodes) 409 { 410 pol->w.cpuset_mems_allowed = *nodes; 411 } 412 413 /* 414 * mpol_rebind_policy - Migrate a policy to a different set of nodes 415 * 416 * Per-vma policies are protected by mmap_lock. 
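 *
 * As an example of the rebind itself: an MPOL_BIND policy over nodes {1,2},
 * created without MPOL_F_STATIC_NODES, is remapped to {5,6} when the cpuset's
 * mems change from {0-3} to {4-7}; with MPOL_F_STATIC_NODES the original user
 * nodemask is kept and only restricted to the nodes still allowed.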
Allocations using per-task 417 * policies are protected by task->mems_allowed_seq to prevent a premature 418 * OOM/allocation failure due to parallel nodemask modification. 419 */ 420 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 421 { 422 if (!pol || pol->mode == MPOL_LOCAL) 423 return; 424 if (!mpol_store_user_nodemask(pol) && 425 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 426 return; 427 428 mpol_ops[pol->mode].rebind(pol, newmask); 429 } 430 431 /* 432 * Wrapper for mpol_rebind_policy() that just requires task 433 * pointer, and updates task mempolicy. 434 * 435 * Called with task's alloc_lock held. 436 */ 437 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 438 { 439 mpol_rebind_policy(tsk->mempolicy, new); 440 } 441 442 /* 443 * Rebind each vma in mm to new nodemask. 444 * 445 * Call holding a reference to mm. Takes mm->mmap_lock during call. 446 */ 447 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 448 { 449 struct vm_area_struct *vma; 450 VMA_ITERATOR(vmi, mm, 0); 451 452 mmap_write_lock(mm); 453 for_each_vma(vmi, vma) { 454 vma_start_write(vma); 455 mpol_rebind_policy(vma->vm_policy, new); 456 } 457 mmap_write_unlock(mm); 458 } 459 460 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 461 [MPOL_DEFAULT] = { 462 .rebind = mpol_rebind_default, 463 }, 464 [MPOL_INTERLEAVE] = { 465 .create = mpol_new_nodemask, 466 .rebind = mpol_rebind_nodemask, 467 }, 468 [MPOL_PREFERRED] = { 469 .create = mpol_new_preferred, 470 .rebind = mpol_rebind_preferred, 471 }, 472 [MPOL_BIND] = { 473 .create = mpol_new_nodemask, 474 .rebind = mpol_rebind_nodemask, 475 }, 476 [MPOL_LOCAL] = { 477 .rebind = mpol_rebind_default, 478 }, 479 [MPOL_PREFERRED_MANY] = { 480 .create = mpol_new_nodemask, 481 .rebind = mpol_rebind_preferred, 482 }, 483 [MPOL_WEIGHTED_INTERLEAVE] = { 484 .create = mpol_new_nodemask, 485 .rebind = mpol_rebind_nodemask, 486 }, 487 }; 488 489 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 490 unsigned long flags); 491 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 492 pgoff_t ilx, int *nid); 493 494 static bool strictly_unmovable(unsigned long flags) 495 { 496 /* 497 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 498 * if any misplaced page is found. 499 */ 500 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 501 MPOL_MF_STRICT; 502 } 503 504 struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 505 struct mempolicy *pol; 506 pgoff_t ilx; 507 }; 508 509 struct queue_pages { 510 struct list_head *pagelist; 511 unsigned long flags; 512 nodemask_t *nmask; 513 unsigned long start; 514 unsigned long end; 515 struct vm_area_struct *first; 516 struct folio *large; /* note last large folio encountered */ 517 long nr_failed; /* could not be isolated at this time */ 518 }; 519 520 /* 521 * Check if the folio's nid is in qp->nmask. 522 * 523 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 524 * in the invert of qp->nmask. 
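 *
 * For example, do_mbind() passes MPOL_MF_INVERT together with the policy's
 * nodemask, so a folio already on one of the requested nodes is left alone,
 * while a folio on any other node is treated as misplaced and queued for
 * migration.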
525 */ 526 static inline bool queue_folio_required(struct folio *folio, 527 struct queue_pages *qp) 528 { 529 int nid = folio_nid(folio); 530 unsigned long flags = qp->flags; 531 532 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 533 } 534 535 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 536 { 537 struct folio *folio; 538 struct queue_pages *qp = walk->private; 539 540 if (unlikely(is_pmd_migration_entry(*pmd))) { 541 qp->nr_failed++; 542 return; 543 } 544 folio = pmd_folio(*pmd); 545 if (is_huge_zero_folio(folio)) { 546 walk->action = ACTION_CONTINUE; 547 return; 548 } 549 if (!queue_folio_required(folio, qp)) 550 return; 551 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 552 !vma_migratable(walk->vma) || 553 !migrate_folio_add(folio, qp->pagelist, qp->flags)) 554 qp->nr_failed++; 555 } 556 557 /* 558 * Scan through folios, checking if they satisfy the required conditions, 559 * moving them from LRU to local pagelist for migration if they do (or not). 560 * 561 * queue_folios_pte_range() has two possible return values: 562 * 0 - continue walking to scan for more, even if an existing folio on the 563 * wrong node could not be isolated and queued for migration. 564 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 565 * and an existing folio was on a node that does not follow the policy. 566 */ 567 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 568 unsigned long end, struct mm_walk *walk) 569 { 570 const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; 571 struct vm_area_struct *vma = walk->vma; 572 struct folio *folio; 573 struct queue_pages *qp = walk->private; 574 unsigned long flags = qp->flags; 575 pte_t *pte, *mapped_pte; 576 pte_t ptent; 577 spinlock_t *ptl; 578 int max_nr, nr; 579 580 ptl = pmd_trans_huge_lock(pmd, vma); 581 if (ptl) { 582 queue_folios_pmd(pmd, walk); 583 spin_unlock(ptl); 584 goto out; 585 } 586 587 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 588 if (!pte) { 589 walk->action = ACTION_AGAIN; 590 return 0; 591 } 592 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { 593 max_nr = (end - addr) >> PAGE_SHIFT; 594 nr = 1; 595 ptent = ptep_get(pte); 596 if (pte_none(ptent)) 597 continue; 598 if (!pte_present(ptent)) { 599 if (is_migration_entry(pte_to_swp_entry(ptent))) 600 qp->nr_failed++; 601 continue; 602 } 603 folio = vm_normal_folio(vma, addr, ptent); 604 if (!folio || folio_is_zone_device(folio)) 605 continue; 606 if (folio_test_large(folio) && max_nr != 1) 607 nr = folio_pte_batch(folio, addr, pte, ptent, 608 max_nr, fpb_flags, 609 NULL, NULL, NULL); 610 /* 611 * vm_normal_folio() filters out zero pages, but there might 612 * still be reserved folios to skip, perhaps in a VDSO. 613 */ 614 if (folio_test_reserved(folio)) 615 continue; 616 if (!queue_folio_required(folio, qp)) 617 continue; 618 if (folio_test_large(folio)) { 619 /* 620 * A large folio can only be isolated from LRU once, 621 * but may be mapped by many PTEs (and Copy-On-Write may 622 * intersperse PTEs of other, order 0, folios). This is 623 * a common case, so don't mistake it for failure (but 624 * there can be other cases of multi-mapped pages which 625 * this quick check does not help to filter out - and a 626 * search of the pagelist might grow to be prohibitive). 627 * 628 * migrate_pages(&pagelist) returns nr_failed folios, so 629 * check "large" now so that queue_pages_range() returns 630 * a comparable nr_failed folios. 
This does imply that 631 * if folio could not be isolated for some racy reason 632 * at its first PTE, later PTEs will not give it another 633 * chance of isolation; but keeps the accounting simple. 634 */ 635 if (folio == qp->large) 636 continue; 637 qp->large = folio; 638 } 639 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 640 !vma_migratable(vma) || 641 !migrate_folio_add(folio, qp->pagelist, flags)) { 642 qp->nr_failed += nr; 643 if (strictly_unmovable(flags)) 644 break; 645 } 646 } 647 pte_unmap_unlock(mapped_pte, ptl); 648 cond_resched(); 649 out: 650 if (qp->nr_failed && strictly_unmovable(flags)) 651 return -EIO; 652 return 0; 653 } 654 655 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, 656 unsigned long addr, unsigned long end, 657 struct mm_walk *walk) 658 { 659 #ifdef CONFIG_HUGETLB_PAGE 660 struct queue_pages *qp = walk->private; 661 unsigned long flags = qp->flags; 662 struct folio *folio; 663 spinlock_t *ptl; 664 pte_t entry; 665 666 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 667 entry = huge_ptep_get(walk->mm, addr, pte); 668 if (!pte_present(entry)) { 669 if (unlikely(is_hugetlb_entry_migration(entry))) 670 qp->nr_failed++; 671 goto unlock; 672 } 673 folio = pfn_folio(pte_pfn(entry)); 674 if (!queue_folio_required(folio, qp)) 675 goto unlock; 676 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 677 !vma_migratable(walk->vma)) { 678 qp->nr_failed++; 679 goto unlock; 680 } 681 /* 682 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 683 * Choosing not to migrate a shared folio is not counted as a failure. 684 * 685 * See folio_maybe_mapped_shared() on possible imprecision when we 686 * cannot easily detect if a folio is shared. 687 */ 688 if ((flags & MPOL_MF_MOVE_ALL) || 689 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) 690 if (!folio_isolate_hugetlb(folio, qp->pagelist)) 691 qp->nr_failed++; 692 unlock: 693 spin_unlock(ptl); 694 if (qp->nr_failed && strictly_unmovable(flags)) 695 return -EIO; 696 #endif 697 return 0; 698 } 699 700 #ifdef CONFIG_NUMA_BALANCING 701 /* 702 * This is used to mark a range of virtual addresses to be inaccessible. 703 * These are later cleared by a NUMA hinting fault. Depending on these 704 * faults, pages may be migrated for better NUMA placement. 705 * 706 * This is assuming that NUMA faults are handled using PROT_NONE. If 707 * an architecture makes a different choice, it will need further 708 * changes to the core. 
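 *
 * Rough usage sketch (illustrative only; the real caller is the NUMA
 * balancing scan code, which walks the VMAs it wants to sample):
 *
 *	mmap_read_lock(mm);
 *	nr_updated = change_prot_numa(vma, vma->vm_start, vma->vm_end);
 *	mmap_read_unlock(mm);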
709 */ 710 unsigned long change_prot_numa(struct vm_area_struct *vma, 711 unsigned long addr, unsigned long end) 712 { 713 struct mmu_gather tlb; 714 long nr_updated; 715 716 tlb_gather_mmu(&tlb, vma->vm_mm); 717 718 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); 719 if (nr_updated > 0) { 720 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 721 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); 722 } 723 724 tlb_finish_mmu(&tlb); 725 726 return nr_updated; 727 } 728 #endif /* CONFIG_NUMA_BALANCING */ 729 730 static int queue_pages_test_walk(unsigned long start, unsigned long end, 731 struct mm_walk *walk) 732 { 733 struct vm_area_struct *next, *vma = walk->vma; 734 struct queue_pages *qp = walk->private; 735 unsigned long flags = qp->flags; 736 737 /* range check first */ 738 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); 739 740 if (!qp->first) { 741 qp->first = vma; 742 if (!(flags & MPOL_MF_DISCONTIG_OK) && 743 (qp->start < vma->vm_start)) 744 /* hole at head side of range */ 745 return -EFAULT; 746 } 747 next = find_vma(vma->vm_mm, vma->vm_end); 748 if (!(flags & MPOL_MF_DISCONTIG_OK) && 749 ((vma->vm_end < qp->end) && 750 (!next || vma->vm_end < next->vm_start))) 751 /* hole at middle or tail of range */ 752 return -EFAULT; 753 754 /* 755 * Need check MPOL_MF_STRICT to return -EIO if possible 756 * regardless of vma_migratable 757 */ 758 if (!vma_migratable(vma) && 759 !(flags & MPOL_MF_STRICT)) 760 return 1; 761 762 /* 763 * Check page nodes, and queue pages to move, in the current vma. 764 * But if no moving, and no strict checking, the scan can be skipped. 765 */ 766 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 767 return 0; 768 return 1; 769 } 770 771 static const struct mm_walk_ops queue_pages_walk_ops = { 772 .hugetlb_entry = queue_folios_hugetlb, 773 .pmd_entry = queue_folios_pte_range, 774 .test_walk = queue_pages_test_walk, 775 .walk_lock = PGWALK_RDLOCK, 776 }; 777 778 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 779 .hugetlb_entry = queue_folios_hugetlb, 780 .pmd_entry = queue_folios_pte_range, 781 .test_walk = queue_pages_test_walk, 782 .walk_lock = PGWALK_WRLOCK, 783 }; 784 785 /* 786 * Walk through page tables and collect pages to be migrated. 787 * 788 * If pages found in a given range are not on the required set of @nodes, 789 * and migration is allowed, they are isolated and queued to @pagelist. 790 * 791 * queue_pages_range() may return: 792 * 0 - all pages already on the right node, or successfully queued for moving 793 * (or neither strict checking nor moving requested: only range checking). 794 * >0 - this number of misplaced folios could not be queued for moving 795 * (a hugetlbfs page or a transparent huge page being counted as 1). 796 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. 797 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. 798 */ 799 static long 800 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 801 nodemask_t *nodes, unsigned long flags, 802 struct list_head *pagelist) 803 { 804 int err; 805 struct queue_pages qp = { 806 .pagelist = pagelist, 807 .flags = flags, 808 .nmask = nodes, 809 .start = start, 810 .end = end, 811 .first = NULL, 812 }; 813 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? 
814 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 815 816 err = walk_page_range(mm, start, end, ops, &qp); 817 818 if (!qp.first) 819 /* whole range in hole */ 820 err = -EFAULT; 821 822 return err ? : qp.nr_failed; 823 } 824 825 /* 826 * Apply policy to a single VMA 827 * This must be called with the mmap_lock held for writing. 828 */ 829 static int vma_replace_policy(struct vm_area_struct *vma, 830 struct mempolicy *pol) 831 { 832 int err; 833 struct mempolicy *old; 834 struct mempolicy *new; 835 836 vma_assert_write_locked(vma); 837 838 new = mpol_dup(pol); 839 if (IS_ERR(new)) 840 return PTR_ERR(new); 841 842 if (vma->vm_ops && vma->vm_ops->set_policy) { 843 err = vma->vm_ops->set_policy(vma, new); 844 if (err) 845 goto err_out; 846 } 847 848 old = vma->vm_policy; 849 vma->vm_policy = new; /* protected by mmap_lock */ 850 mpol_put(old); 851 852 return 0; 853 err_out: 854 mpol_put(new); 855 return err; 856 } 857 858 /* Split or merge the VMA (if required) and apply the new policy */ 859 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, 860 struct vm_area_struct **prev, unsigned long start, 861 unsigned long end, struct mempolicy *new_pol) 862 { 863 unsigned long vmstart, vmend; 864 865 vmend = min(end, vma->vm_end); 866 if (start > vma->vm_start) { 867 *prev = vma; 868 vmstart = start; 869 } else { 870 vmstart = vma->vm_start; 871 } 872 873 if (mpol_equal(vma->vm_policy, new_pol)) { 874 *prev = vma; 875 return 0; 876 } 877 878 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); 879 if (IS_ERR(vma)) 880 return PTR_ERR(vma); 881 882 *prev = vma; 883 return vma_replace_policy(vma, new_pol); 884 } 885 886 /* Set the process memory policy */ 887 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 888 nodemask_t *nodes) 889 { 890 struct mempolicy *new, *old; 891 NODEMASK_SCRATCH(scratch); 892 int ret; 893 894 if (!scratch) 895 return -ENOMEM; 896 897 new = mpol_new(mode, flags, nodes); 898 if (IS_ERR(new)) { 899 ret = PTR_ERR(new); 900 goto out; 901 } 902 903 task_lock(current); 904 ret = mpol_set_nodemask(new, nodes, scratch); 905 if (ret) { 906 task_unlock(current); 907 mpol_put(new); 908 goto out; 909 } 910 911 old = current->mempolicy; 912 current->mempolicy = new; 913 if (new && (new->mode == MPOL_INTERLEAVE || 914 new->mode == MPOL_WEIGHTED_INTERLEAVE)) { 915 current->il_prev = MAX_NUMNODES-1; 916 current->il_weight = 0; 917 } 918 task_unlock(current); 919 mpol_put(old); 920 ret = 0; 921 out: 922 NODEMASK_SCRATCH_FREE(scratch); 923 return ret; 924 } 925 926 /* 927 * Return nodemask for policy for get_mempolicy() query 928 * 929 * Called with task's alloc_lock held 930 */ 931 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) 932 { 933 nodes_clear(*nodes); 934 if (pol == &default_policy) 935 return; 936 937 switch (pol->mode) { 938 case MPOL_BIND: 939 case MPOL_INTERLEAVE: 940 case MPOL_PREFERRED: 941 case MPOL_PREFERRED_MANY: 942 case MPOL_WEIGHTED_INTERLEAVE: 943 *nodes = pol->nodes; 944 break; 945 case MPOL_LOCAL: 946 /* return empty node mask for local allocation */ 947 break; 948 default: 949 BUG(); 950 } 951 } 952 953 static int lookup_node(struct mm_struct *mm, unsigned long addr) 954 { 955 struct page *p = NULL; 956 int ret; 957 958 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); 959 if (ret > 0) { 960 ret = page_to_nid(p); 961 put_page(p); 962 } 963 return ret; 964 } 965 966 /* Retrieve NUMA policy */ 967 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 968 unsigned 
long addr, unsigned long flags) 969 { 970 int err; 971 struct mm_struct *mm = current->mm; 972 struct vm_area_struct *vma = NULL; 973 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; 974 975 if (flags & 976 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 977 return -EINVAL; 978 979 if (flags & MPOL_F_MEMS_ALLOWED) { 980 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 981 return -EINVAL; 982 *policy = 0; /* just so it's initialized */ 983 task_lock(current); 984 *nmask = cpuset_current_mems_allowed; 985 task_unlock(current); 986 return 0; 987 } 988 989 if (flags & MPOL_F_ADDR) { 990 pgoff_t ilx; /* ignored here */ 991 /* 992 * Do NOT fall back to task policy if the 993 * vma/shared policy at addr is NULL. We 994 * want to return MPOL_DEFAULT in this case. 995 */ 996 mmap_read_lock(mm); 997 vma = vma_lookup(mm, addr); 998 if (!vma) { 999 mmap_read_unlock(mm); 1000 return -EFAULT; 1001 } 1002 pol = __get_vma_policy(vma, addr, &ilx); 1003 } else if (addr) 1004 return -EINVAL; 1005 1006 if (!pol) 1007 pol = &default_policy; /* indicates default behavior */ 1008 1009 if (flags & MPOL_F_NODE) { 1010 if (flags & MPOL_F_ADDR) { 1011 /* 1012 * Take a refcount on the mpol, because we are about to 1013 * drop the mmap_lock, after which only "pol" remains 1014 * valid, "vma" is stale. 1015 */ 1016 pol_refcount = pol; 1017 vma = NULL; 1018 mpol_get(pol); 1019 mmap_read_unlock(mm); 1020 err = lookup_node(mm, addr); 1021 if (err < 0) 1022 goto out; 1023 *policy = err; 1024 } else if (pol == current->mempolicy && 1025 pol->mode == MPOL_INTERLEAVE) { 1026 *policy = next_node_in(current->il_prev, pol->nodes); 1027 } else if (pol == current->mempolicy && 1028 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1029 if (current->il_weight) 1030 *policy = current->il_prev; 1031 else 1032 *policy = next_node_in(current->il_prev, 1033 pol->nodes); 1034 } else { 1035 err = -EINVAL; 1036 goto out; 1037 } 1038 } else { 1039 *policy = pol == &default_policy ? MPOL_DEFAULT : 1040 pol->mode; 1041 /* 1042 * Internal mempolicy flags must be masked off before exposing 1043 * the policy to userspace. 1044 */ 1045 *policy |= (pol->flags & MPOL_MODE_FLAGS); 1046 } 1047 1048 err = 0; 1049 if (nmask) { 1050 if (mpol_store_user_nodemask(pol)) { 1051 *nmask = pol->w.user_nodemask; 1052 } else { 1053 task_lock(current); 1054 get_policy_nodemask(pol, nmask); 1055 task_unlock(current); 1056 } 1057 } 1058 1059 out: 1060 mpol_cond_put(pol); 1061 if (vma) 1062 mmap_read_unlock(mm); 1063 if (pol_refcount) 1064 mpol_put(pol_refcount); 1065 return err; 1066 } 1067 1068 #ifdef CONFIG_MIGRATION 1069 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1070 unsigned long flags) 1071 { 1072 /* 1073 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 1074 * Choosing not to migrate a shared folio is not counted as a failure. 1075 * 1076 * See folio_maybe_mapped_shared() on possible imprecision when we 1077 * cannot easily detect if a folio is shared. 1078 */ 1079 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { 1080 if (folio_isolate_lru(folio)) { 1081 list_add_tail(&folio->lru, foliolist); 1082 node_stat_mod_folio(folio, 1083 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1084 folio_nr_pages(folio)); 1085 } else { 1086 /* 1087 * Non-movable folio may reach here. And, there may be 1088 * temporary off LRU folios or non-LRU movable folios. 1089 * Treat them as unmovable folios since they can't be 1090 * isolated, so they can't be moved at the moment. 
1091 */ 1092 return false; 1093 } 1094 } 1095 return true; 1096 } 1097 1098 /* 1099 * Migrate pages from one node to a target node. 1100 * Returns error or the number of pages not migrated. 1101 */ 1102 static long migrate_to_node(struct mm_struct *mm, int source, int dest, 1103 int flags) 1104 { 1105 nodemask_t nmask; 1106 struct vm_area_struct *vma; 1107 LIST_HEAD(pagelist); 1108 long nr_failed; 1109 long err = 0; 1110 struct migration_target_control mtc = { 1111 .nid = dest, 1112 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 1113 .reason = MR_SYSCALL, 1114 }; 1115 1116 nodes_clear(nmask); 1117 node_set(source, nmask); 1118 1119 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1120 1121 mmap_read_lock(mm); 1122 vma = find_vma(mm, 0); 1123 if (unlikely(!vma)) { 1124 mmap_read_unlock(mm); 1125 return 0; 1126 } 1127 1128 /* 1129 * This does not migrate the range, but isolates all pages that 1130 * need migration. Between passing in the full user address 1131 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, 1132 * but passes back the count of pages which could not be isolated. 1133 */ 1134 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, 1135 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1136 mmap_read_unlock(mm); 1137 1138 if (!list_empty(&pagelist)) { 1139 err = migrate_pages(&pagelist, alloc_migration_target, NULL, 1140 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); 1141 if (err) 1142 putback_movable_pages(&pagelist); 1143 } 1144 1145 if (err >= 0) 1146 err += nr_failed; 1147 return err; 1148 } 1149 1150 /* 1151 * Move pages between the two nodesets so as to preserve the physical 1152 * layout as much as possible. 1153 * 1154 * Returns the number of page that could not be moved. 1155 */ 1156 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1157 const nodemask_t *to, int flags) 1158 { 1159 long nr_failed = 0; 1160 long err = 0; 1161 nodemask_t tmp; 1162 1163 lru_cache_disable(); 1164 1165 /* 1166 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1167 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1168 * bit in 'tmp', and return that <source, dest> pair for migration. 1169 * The pair of nodemasks 'to' and 'from' define the map. 1170 * 1171 * If no pair of bits is found that way, fallback to picking some 1172 * pair of 'source' and 'dest' bits that are not the same. If the 1173 * 'source' and 'dest' bits are the same, this represents a node 1174 * that will be migrating to itself, so no pages need move. 1175 * 1176 * If no bits are left in 'tmp', or if all remaining bits left 1177 * in 'tmp' correspond to the same bit in 'to', return false 1178 * (nothing left to migrate). 1179 * 1180 * This lets us pick a pair of nodes to migrate between, such that 1181 * if possible the dest node is not already occupied by some other 1182 * source node, minimizing the risk of overloading the memory on a 1183 * node that would happen if we migrated incoming memory to a node 1184 * before migrating outgoing memory source that same node. 1185 * 1186 * A single scan of tmp is sufficient. As we go, we remember the 1187 * most recent <s, d> pair that moved (s != d). If we find a pair 1188 * that not only moved, but what's better, moved to an empty slot 1189 * (d is not set in tmp), then we break out then, with that pair. 1190 * Otherwise when we finish scanning from_tmp, we at least have the 1191 * most recent <s, d> pair that moved. 
If we get all the way through 1192 * the scan of tmp without finding any node that moved, much less 1193 * moved to an empty node, then there is nothing left worth migrating. 1194 */ 1195 1196 tmp = *from; 1197 while (!nodes_empty(tmp)) { 1198 int s, d; 1199 int source = NUMA_NO_NODE; 1200 int dest = 0; 1201 1202 for_each_node_mask(s, tmp) { 1203 1204 /* 1205 * do_migrate_pages() tries to maintain the relative 1206 * node relationship of the pages established between 1207 * threads and memory areas. 1208 * 1209 * However if the number of source nodes is not equal to 1210 * the number of destination nodes we can not preserve 1211 * this node relative relationship. In that case, skip 1212 * copying memory from a node that is in the destination 1213 * mask. 1214 * 1215 * Example: [2,3,4] -> [3,4,5] moves everything. 1216 * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 1217 */ 1218 1219 if ((nodes_weight(*from) != nodes_weight(*to)) && 1220 (node_isset(s, *to))) 1221 continue; 1222 1223 d = node_remap(s, *from, *to); 1224 if (s == d) 1225 continue; 1226 1227 source = s; /* Node moved. Memorize */ 1228 dest = d; 1229 1230 /* dest not in remaining from nodes? */ 1231 if (!node_isset(dest, tmp)) 1232 break; 1233 } 1234 if (source == NUMA_NO_NODE) 1235 break; 1236 1237 node_clear(source, tmp); 1238 err = migrate_to_node(mm, source, dest, flags); 1239 if (err > 0) 1240 nr_failed += err; 1241 if (err < 0) 1242 break; 1243 } 1244 1245 lru_cache_enable(); 1246 if (err < 0) 1247 return err; 1248 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; 1249 } 1250 1251 /* 1252 * Allocate a new folio for page migration, according to NUMA mempolicy. 1253 */ 1254 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1255 unsigned long private) 1256 { 1257 struct migration_mpol *mmpol = (struct migration_mpol *)private; 1258 struct mempolicy *pol = mmpol->pol; 1259 pgoff_t ilx = mmpol->ilx; 1260 unsigned int order; 1261 int nid = numa_node_id(); 1262 gfp_t gfp; 1263 1264 order = folio_order(src); 1265 ilx += src->index >> order; 1266 1267 if (folio_test_hugetlb(src)) { 1268 nodemask_t *nodemask; 1269 struct hstate *h; 1270 1271 h = folio_hstate(src); 1272 gfp = htlb_alloc_mask(h); 1273 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 1274 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, 1275 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); 1276 } 1277 1278 if (folio_test_large(src)) 1279 gfp = GFP_TRANSHUGE; 1280 else 1281 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; 1282 1283 return folio_alloc_mpol(gfp, order, pol, ilx, nid); 1284 } 1285 #else 1286 1287 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1288 unsigned long flags) 1289 { 1290 return false; 1291 } 1292 1293 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1294 const nodemask_t *to, int flags) 1295 { 1296 return -ENOSYS; 1297 } 1298 1299 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1300 unsigned long private) 1301 { 1302 return NULL; 1303 } 1304 #endif 1305 1306 static long do_mbind(unsigned long start, unsigned long len, 1307 unsigned short mode, unsigned short mode_flags, 1308 nodemask_t *nmask, unsigned long flags) 1309 { 1310 struct mm_struct *mm = current->mm; 1311 struct vm_area_struct *vma, *prev; 1312 struct vma_iterator vmi; 1313 struct migration_mpol mmpol; 1314 struct mempolicy *new; 1315 unsigned long end; 1316 long err; 1317 long nr_failed; 1318 LIST_HEAD(pagelist); 1319 1320 if (flags & ~(unsigned long)MPOL_MF_VALID) 1321 
return -EINVAL; 1322 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1323 return -EPERM; 1324 1325 if (start & ~PAGE_MASK) 1326 return -EINVAL; 1327 1328 if (mode == MPOL_DEFAULT) 1329 flags &= ~MPOL_MF_STRICT; 1330 1331 len = PAGE_ALIGN(len); 1332 end = start + len; 1333 1334 if (end < start) 1335 return -EINVAL; 1336 if (end == start) 1337 return 0; 1338 1339 new = mpol_new(mode, mode_flags, nmask); 1340 if (IS_ERR(new)) 1341 return PTR_ERR(new); 1342 1343 /* 1344 * If we are using the default policy then operation 1345 * on discontinuous address spaces is okay after all 1346 */ 1347 if (!new) 1348 flags |= MPOL_MF_DISCONTIG_OK; 1349 1350 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1351 lru_cache_disable(); 1352 { 1353 NODEMASK_SCRATCH(scratch); 1354 if (scratch) { 1355 mmap_write_lock(mm); 1356 err = mpol_set_nodemask(new, nmask, scratch); 1357 if (err) 1358 mmap_write_unlock(mm); 1359 } else 1360 err = -ENOMEM; 1361 NODEMASK_SCRATCH_FREE(scratch); 1362 } 1363 if (err) 1364 goto mpol_out; 1365 1366 /* 1367 * Lock the VMAs before scanning for pages to migrate, 1368 * to ensure we don't miss a concurrently inserted page. 1369 */ 1370 nr_failed = queue_pages_range(mm, start, end, nmask, 1371 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); 1372 1373 if (nr_failed < 0) { 1374 err = nr_failed; 1375 nr_failed = 0; 1376 } else { 1377 vma_iter_init(&vmi, mm, start); 1378 prev = vma_prev(&vmi); 1379 for_each_vma_range(vmi, vma, end) { 1380 err = mbind_range(&vmi, vma, &prev, start, end, new); 1381 if (err) 1382 break; 1383 } 1384 } 1385 1386 if (!err && !list_empty(&pagelist)) { 1387 /* Convert MPOL_DEFAULT's NULL to task or default policy */ 1388 if (!new) { 1389 new = get_task_policy(current); 1390 mpol_get(new); 1391 } 1392 mmpol.pol = new; 1393 mmpol.ilx = 0; 1394 1395 /* 1396 * In the interleaved case, attempt to allocate on exactly the 1397 * targeted nodes, for the first VMA to be migrated; for later 1398 * VMAs, the nodes will still be interleaved from the targeted 1399 * nodemask, but one by one may be selected differently. 1400 */ 1401 if (new->mode == MPOL_INTERLEAVE || 1402 new->mode == MPOL_WEIGHTED_INTERLEAVE) { 1403 struct folio *folio; 1404 unsigned int order; 1405 unsigned long addr = -EFAULT; 1406 1407 list_for_each_entry(folio, &pagelist, lru) { 1408 if (!folio_test_ksm(folio)) 1409 break; 1410 } 1411 if (!list_entry_is_head(folio, &pagelist, lru)) { 1412 vma_iter_init(&vmi, mm, start); 1413 for_each_vma_range(vmi, vma, end) { 1414 addr = page_address_in_vma(folio, 1415 folio_page(folio, 0), vma); 1416 if (addr != -EFAULT) 1417 break; 1418 } 1419 } 1420 if (addr != -EFAULT) { 1421 order = folio_order(folio); 1422 /* We already know the pol, but not the ilx */ 1423 mpol_cond_put(get_vma_policy(vma, addr, order, 1424 &mmpol.ilx)); 1425 /* Set base from which to increment by index */ 1426 mmpol.ilx -= folio->index >> order; 1427 } 1428 } 1429 } 1430 1431 mmap_write_unlock(mm); 1432 1433 if (!err && !list_empty(&pagelist)) { 1434 nr_failed |= migrate_pages(&pagelist, 1435 alloc_migration_target_by_mpol, NULL, 1436 (unsigned long)&mmpol, MIGRATE_SYNC, 1437 MR_MEMPOLICY_MBIND, NULL); 1438 } 1439 1440 if (nr_failed && (flags & MPOL_MF_STRICT)) 1441 err = -EIO; 1442 if (!list_empty(&pagelist)) 1443 putback_movable_pages(&pagelist); 1444 mpol_out: 1445 mpol_put(new); 1446 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1447 lru_cache_enable(); 1448 return err; 1449 } 1450 1451 /* 1452 * User space interface with variable sized bitmaps for nodelists. 
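 *
 * Illustrative userspace call (not kernel code): the mask is an array of
 * unsigned longs and maxnode is its size in bits; keep maxnode comfortably
 * larger than the highest node id in use.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 2);	// nodes 0 and 2
 *	if (set_mempolicy(MPOL_BIND, &mask, 8 * sizeof(mask)))
 *		perror("set_mempolicy");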
1453 */ 1454 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, 1455 unsigned long maxnode) 1456 { 1457 unsigned long nlongs = BITS_TO_LONGS(maxnode); 1458 int ret; 1459 1460 if (in_compat_syscall()) 1461 ret = compat_get_bitmap(mask, 1462 (const compat_ulong_t __user *)nmask, 1463 maxnode); 1464 else 1465 ret = copy_from_user(mask, nmask, 1466 nlongs * sizeof(unsigned long)); 1467 1468 if (ret) 1469 return -EFAULT; 1470 1471 if (maxnode % BITS_PER_LONG) 1472 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; 1473 1474 return 0; 1475 } 1476 1477 /* Copy a node mask from user space. */ 1478 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1479 unsigned long maxnode) 1480 { 1481 --maxnode; 1482 nodes_clear(*nodes); 1483 if (maxnode == 0 || !nmask) 1484 return 0; 1485 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1486 return -EINVAL; 1487 1488 /* 1489 * When the user specified more nodes than supported just check 1490 * if the non supported part is all zero, one word at a time, 1491 * starting at the end. 1492 */ 1493 while (maxnode > MAX_NUMNODES) { 1494 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); 1495 unsigned long t; 1496 1497 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) 1498 return -EFAULT; 1499 1500 if (maxnode - bits >= MAX_NUMNODES) { 1501 maxnode -= bits; 1502 } else { 1503 maxnode = MAX_NUMNODES; 1504 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); 1505 } 1506 if (t) 1507 return -EINVAL; 1508 } 1509 1510 return get_bitmap(nodes_addr(*nodes), nmask, maxnode); 1511 } 1512 1513 /* Copy a kernel node mask to user space */ 1514 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1515 nodemask_t *nodes) 1516 { 1517 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1518 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); 1519 bool compat = in_compat_syscall(); 1520 1521 if (compat) 1522 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); 1523 1524 if (copy > nbytes) { 1525 if (copy > PAGE_SIZE) 1526 return -EINVAL; 1527 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1528 return -EFAULT; 1529 copy = nbytes; 1530 maxnode = nr_node_ids; 1531 } 1532 1533 if (compat) 1534 return compat_put_bitmap((compat_ulong_t __user *)mask, 1535 nodes_addr(*nodes), maxnode); 1536 1537 return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; 1538 } 1539 1540 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ 1541 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) 1542 { 1543 *flags = *mode & MPOL_MODE_FLAGS; 1544 *mode &= ~MPOL_MODE_FLAGS; 1545 1546 if ((unsigned int)(*mode) >= MPOL_MAX) 1547 return -EINVAL; 1548 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) 1549 return -EINVAL; 1550 if (*flags & MPOL_F_NUMA_BALANCING) { 1551 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) 1552 *flags |= (MPOL_F_MOF | MPOL_F_MORON); 1553 else 1554 return -EINVAL; 1555 } 1556 return 0; 1557 } 1558 1559 static long kernel_mbind(unsigned long start, unsigned long len, 1560 unsigned long mode, const unsigned long __user *nmask, 1561 unsigned long maxnode, unsigned int flags) 1562 { 1563 unsigned short mode_flags; 1564 nodemask_t nodes; 1565 int lmode = mode; 1566 int err; 1567 1568 start = untagged_addr(start); 1569 err = sanitize_mpol_flags(&lmode, &mode_flags); 1570 if (err) 1571 return err; 1572 1573 err = get_nodes(&nodes, nmask, maxnode); 1574 if (err) 1575 return err; 1576 1577 return do_mbind(start, len, lmode, mode_flags, &nodes, flags); 1578 } 1579 1580 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, 1581 unsigned long, home_node, unsigned long, flags) 1582 { 1583 struct mm_struct *mm = current->mm; 1584 struct vm_area_struct *vma, *prev; 1585 struct mempolicy *new, *old; 1586 unsigned long end; 1587 int err = -ENOENT; 1588 VMA_ITERATOR(vmi, mm, start); 1589 1590 start = untagged_addr(start); 1591 if (start & ~PAGE_MASK) 1592 return -EINVAL; 1593 /* 1594 * flags is used for future extension if any. 1595 */ 1596 if (flags != 0) 1597 return -EINVAL; 1598 1599 /* 1600 * Check home_node is online to avoid accessing uninitialized 1601 * NODE_DATA. 1602 */ 1603 if (home_node >= MAX_NUMNODES || !node_online(home_node)) 1604 return -EINVAL; 1605 1606 len = PAGE_ALIGN(len); 1607 end = start + len; 1608 1609 if (end < start) 1610 return -EINVAL; 1611 if (end == start) 1612 return 0; 1613 mmap_write_lock(mm); 1614 prev = vma_prev(&vmi); 1615 for_each_vma_range(vmi, vma, end) { 1616 /* 1617 * If any vma in the range got policy other than MPOL_BIND 1618 * or MPOL_PREFERRED_MANY we return error. We don't reset 1619 * the home node for vmas we already updated before. 
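 *
 * Illustrative userspace sequence (raw syscall, as this call typically has
 * no libc wrapper; node 2 is just an example):
 *
 *	mbind(addr, len, MPOL_PREFERRED_MANY, &mask, maxnode, 0);
 *	syscall(__NR_set_mempolicy_home_node, addr, len, 2, 0);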
1620 */ 1621 old = vma_policy(vma); 1622 if (!old) { 1623 prev = vma; 1624 continue; 1625 } 1626 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { 1627 err = -EOPNOTSUPP; 1628 break; 1629 } 1630 new = mpol_dup(old); 1631 if (IS_ERR(new)) { 1632 err = PTR_ERR(new); 1633 break; 1634 } 1635 1636 vma_start_write(vma); 1637 new->home_node = home_node; 1638 err = mbind_range(&vmi, vma, &prev, start, end, new); 1639 mpol_put(new); 1640 if (err) 1641 break; 1642 } 1643 mmap_write_unlock(mm); 1644 return err; 1645 } 1646 1647 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1648 unsigned long, mode, const unsigned long __user *, nmask, 1649 unsigned long, maxnode, unsigned int, flags) 1650 { 1651 return kernel_mbind(start, len, mode, nmask, maxnode, flags); 1652 } 1653 1654 /* Set the process memory policy */ 1655 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, 1656 unsigned long maxnode) 1657 { 1658 unsigned short mode_flags; 1659 nodemask_t nodes; 1660 int lmode = mode; 1661 int err; 1662 1663 err = sanitize_mpol_flags(&lmode, &mode_flags); 1664 if (err) 1665 return err; 1666 1667 err = get_nodes(&nodes, nmask, maxnode); 1668 if (err) 1669 return err; 1670 1671 return do_set_mempolicy(lmode, mode_flags, &nodes); 1672 } 1673 1674 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, 1675 unsigned long, maxnode) 1676 { 1677 return kernel_set_mempolicy(mode, nmask, maxnode); 1678 } 1679 1680 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, 1681 const unsigned long __user *old_nodes, 1682 const unsigned long __user *new_nodes) 1683 { 1684 struct mm_struct *mm = NULL; 1685 struct task_struct *task; 1686 nodemask_t task_nodes; 1687 int err; 1688 nodemask_t *old; 1689 nodemask_t *new; 1690 NODEMASK_SCRATCH(scratch); 1691 1692 if (!scratch) 1693 return -ENOMEM; 1694 1695 old = &scratch->mask1; 1696 new = &scratch->mask2; 1697 1698 err = get_nodes(old, old_nodes, maxnode); 1699 if (err) 1700 goto out; 1701 1702 err = get_nodes(new, new_nodes, maxnode); 1703 if (err) 1704 goto out; 1705 1706 /* Find the mm_struct */ 1707 rcu_read_lock(); 1708 task = pid ? find_task_by_vpid(pid) : current; 1709 if (!task) { 1710 rcu_read_unlock(); 1711 err = -ESRCH; 1712 goto out; 1713 } 1714 get_task_struct(task); 1715 1716 err = -EINVAL; 1717 1718 /* 1719 * Check if this process has the right to modify the specified process. 1720 * Use the regular "ptrace_may_access()" checks. 1721 */ 1722 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1723 rcu_read_unlock(); 1724 err = -EPERM; 1725 goto out_put; 1726 } 1727 rcu_read_unlock(); 1728 1729 task_nodes = cpuset_mems_allowed(task); 1730 /* Is the user allowed to access the target nodes? */ 1731 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1732 err = -EPERM; 1733 goto out_put; 1734 } 1735 1736 task_nodes = cpuset_mems_allowed(current); 1737 nodes_and(*new, *new, task_nodes); 1738 if (nodes_empty(*new)) 1739 goto out_put; 1740 1741 err = security_task_movememory(task); 1742 if (err) 1743 goto out_put; 1744 1745 mm = get_task_mm(task); 1746 put_task_struct(task); 1747 1748 if (!mm) { 1749 err = -EINVAL; 1750 goto out; 1751 } 1752 1753 err = do_migrate_pages(mm, old, new, 1754 capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1755 1756 mmput(mm); 1757 out: 1758 NODEMASK_SCRATCH_FREE(scratch); 1759 1760 return err; 1761 1762 out_put: 1763 put_task_struct(task); 1764 goto out; 1765 } 1766 1767 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1768 const unsigned long __user *, old_nodes, 1769 const unsigned long __user *, new_nodes) 1770 { 1771 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); 1772 } 1773 1774 /* Retrieve NUMA policy */ 1775 static int kernel_get_mempolicy(int __user *policy, 1776 unsigned long __user *nmask, 1777 unsigned long maxnode, 1778 unsigned long addr, 1779 unsigned long flags) 1780 { 1781 int err; 1782 int pval; 1783 nodemask_t nodes; 1784 1785 if (nmask != NULL && maxnode < nr_node_ids) 1786 return -EINVAL; 1787 1788 addr = untagged_addr(addr); 1789 1790 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1791 1792 if (err) 1793 return err; 1794 1795 if (policy && put_user(pval, policy)) 1796 return -EFAULT; 1797 1798 if (nmask) 1799 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1800 1801 return err; 1802 } 1803 1804 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1805 unsigned long __user *, nmask, unsigned long, maxnode, 1806 unsigned long, addr, unsigned long, flags) 1807 { 1808 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); 1809 } 1810 1811 bool vma_migratable(struct vm_area_struct *vma) 1812 { 1813 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1814 return false; 1815 1816 /* 1817 * DAX device mappings require predictable access latency, so avoid 1818 * incurring periodic faults. 1819 */ 1820 if (vma_is_dax(vma)) 1821 return false; 1822 1823 if (is_vm_hugetlb_page(vma) && 1824 !hugepage_migration_supported(hstate_vma(vma))) 1825 return false; 1826 1827 /* 1828 * Migration allocates pages in the highest zone. If we cannot 1829 * do so then migration (at least from node to node) is not 1830 * possible. 1831 */ 1832 if (vma->vm_file && 1833 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) 1834 < policy_zone) 1835 return false; 1836 return true; 1837 } 1838 1839 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 1840 unsigned long addr, pgoff_t *ilx) 1841 { 1842 *ilx = 0; 1843 return (vma->vm_ops && vma->vm_ops->get_policy) ? 1844 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; 1845 } 1846 1847 /* 1848 * get_vma_policy(@vma, @addr, @order, @ilx) 1849 * @vma: virtual memory area whose policy is sought 1850 * @addr: address in @vma for shared policy lookup 1851 * @order: 0, or appropriate huge_page_order for interleaving 1852 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or 1853 * MPOL_WEIGHTED_INTERLEAVE 1854 * 1855 * Returns effective policy for a VMA at specified address. 1856 * Falls back to current->mempolicy or system default policy, as necessary. 1857 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1858 * count--added by the get_policy() vm_op, as appropriate--to protect against 1859 * freeing by another task. It is the caller's responsibility to free the 1860 * extra reference for shared policies. 
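 *
 * Typical call pattern within mm (sketch):
 *
 *	pgoff_t ilx;
 *	struct mempolicy *pol = get_vma_policy(vma, addr, 0, &ilx);
 *	// ... use pol for the allocation decision ...
 *	mpol_cond_put(pol);	// drops the ref only for shared policies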
1861 */ 1862 struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1863 unsigned long addr, int order, pgoff_t *ilx) 1864 { 1865 struct mempolicy *pol; 1866 1867 pol = __get_vma_policy(vma, addr, ilx); 1868 if (!pol) 1869 pol = get_task_policy(current); 1870 if (pol->mode == MPOL_INTERLEAVE || 1871 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1872 *ilx += vma->vm_pgoff >> order; 1873 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); 1874 } 1875 return pol; 1876 } 1877 1878 bool vma_policy_mof(struct vm_area_struct *vma) 1879 { 1880 struct mempolicy *pol; 1881 1882 if (vma->vm_ops && vma->vm_ops->get_policy) { 1883 bool ret = false; 1884 pgoff_t ilx; /* ignored here */ 1885 1886 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); 1887 if (pol && (pol->flags & MPOL_F_MOF)) 1888 ret = true; 1889 mpol_cond_put(pol); 1890 1891 return ret; 1892 } 1893 1894 pol = vma->vm_policy; 1895 if (!pol) 1896 pol = get_task_policy(current); 1897 1898 return pol->flags & MPOL_F_MOF; 1899 } 1900 1901 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1902 { 1903 enum zone_type dynamic_policy_zone = policy_zone; 1904 1905 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 1906 1907 /* 1908 * if policy->nodes has movable memory only, 1909 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 1910 * 1911 * policy->nodes is intersect with node_states[N_MEMORY]. 1912 * so if the following test fails, it implies 1913 * policy->nodes has movable memory only. 1914 */ 1915 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) 1916 dynamic_policy_zone = ZONE_MOVABLE; 1917 1918 return zone >= dynamic_policy_zone; 1919 } 1920 1921 static unsigned int weighted_interleave_nodes(struct mempolicy *policy) 1922 { 1923 unsigned int node; 1924 unsigned int cpuset_mems_cookie; 1925 1926 retry: 1927 /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ 1928 cpuset_mems_cookie = read_mems_allowed_begin(); 1929 node = current->il_prev; 1930 if (!current->il_weight || !node_isset(node, policy->nodes)) { 1931 node = next_node_in(node, policy->nodes); 1932 if (read_mems_allowed_retry(cpuset_mems_cookie)) 1933 goto retry; 1934 if (node == MAX_NUMNODES) 1935 return node; 1936 current->il_prev = node; 1937 current->il_weight = get_il_weight(node); 1938 } 1939 current->il_weight--; 1940 return node; 1941 } 1942 1943 /* Do dynamic interleaving for a process */ 1944 static unsigned int interleave_nodes(struct mempolicy *policy) 1945 { 1946 unsigned int nid; 1947 unsigned int cpuset_mems_cookie; 1948 1949 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ 1950 do { 1951 cpuset_mems_cookie = read_mems_allowed_begin(); 1952 nid = next_node_in(current->il_prev, policy->nodes); 1953 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 1954 1955 if (nid < MAX_NUMNODES) 1956 current->il_prev = nid; 1957 return nid; 1958 } 1959 1960 /* 1961 * Depending on the memory policy provide a node from which to allocate the 1962 * next slab entry. 
1963 */ 1964 unsigned int mempolicy_slab_node(void) 1965 { 1966 struct mempolicy *policy; 1967 int node = numa_mem_id(); 1968 1969 if (!in_task()) 1970 return node; 1971 1972 policy = current->mempolicy; 1973 if (!policy) 1974 return node; 1975 1976 switch (policy->mode) { 1977 case MPOL_PREFERRED: 1978 return first_node(policy->nodes); 1979 1980 case MPOL_INTERLEAVE: 1981 return interleave_nodes(policy); 1982 1983 case MPOL_WEIGHTED_INTERLEAVE: 1984 return weighted_interleave_nodes(policy); 1985 1986 case MPOL_BIND: 1987 case MPOL_PREFERRED_MANY: 1988 { 1989 struct zoneref *z; 1990 1991 /* 1992 * Follow bind policy behavior and start allocation at the 1993 * first node. 1994 */ 1995 struct zonelist *zonelist; 1996 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1997 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 1998 z = first_zones_zonelist(zonelist, highest_zoneidx, 1999 &policy->nodes); 2000 return zonelist_zone(z) ? zonelist_node_idx(z) : node; 2001 } 2002 case MPOL_LOCAL: 2003 return node; 2004 2005 default: 2006 BUG(); 2007 } 2008 } 2009 2010 static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 2011 nodemask_t *mask) 2012 { 2013 /* 2014 * barrier stabilizes the nodemask locally so that it can be iterated 2015 * over safely without concern for changes. Allocators validate node 2016 * selection does not violate mems_allowed, so this is safe. 2017 */ 2018 barrier(); 2019 memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 2020 barrier(); 2021 return nodes_weight(*mask); 2022 } 2023 2024 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2025 { 2026 nodemask_t nodemask; 2027 unsigned int target, nr_nodes; 2028 u8 *table; 2029 unsigned int weight_total = 0; 2030 u8 weight; 2031 int nid; 2032 2033 nr_nodes = read_once_policy_nodemask(pol, &nodemask); 2034 if (!nr_nodes) 2035 return numa_node_id(); 2036 2037 rcu_read_lock(); 2038 table = rcu_dereference(iw_table); 2039 /* calculate the total weight */ 2040 for_each_node_mask(nid, nodemask) { 2041 /* detect system default usage */ 2042 weight = table ? table[nid] : 1; 2043 weight = weight ? weight : 1; 2044 weight_total += weight; 2045 } 2046 2047 /* Calculate the node offset based on totals */ 2048 target = ilx % weight_total; 2049 nid = first_node(nodemask); 2050 while (target) { 2051 /* detect system default usage */ 2052 weight = table ? table[nid] : 1; 2053 weight = weight ? weight : 1; 2054 if (target < weight) 2055 break; 2056 target -= weight; 2057 nid = next_node_in(nid, nodemask); 2058 } 2059 rcu_read_unlock(); 2060 return nid; 2061 } 2062 2063 /* 2064 * Do static interleaving for interleave index @ilx. Returns the ilx'th 2065 * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2066 * exceeds the number of present nodes. 2067 */ 2068 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2069 { 2070 nodemask_t nodemask; 2071 unsigned int target, nnodes; 2072 int i; 2073 int nid; 2074 2075 nnodes = read_once_policy_nodemask(pol, &nodemask); 2076 if (!nnodes) 2077 return numa_node_id(); 2078 target = ilx % nnodes; 2079 nid = first_node(nodemask); 2080 for (i = 0; i < target; i++) 2081 nid = next_node(nid, nodemask); 2082 return nid; 2083 } 2084 2085 /* 2086 * Return a nodemask representing a mempolicy for filtering nodes for 2087 * page allocation, together with preferred node id (or the input node id). 
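 *
 * Illustrative outcomes (summarizing the switch below): MPOL_PREFERRED and
 * the two interleave modes only rewrite *nid and leave the nodemask NULL;
 * MPOL_BIND and MPOL_PREFERRED_MANY return &pol->nodes and may rewrite *nid
 * to the home node when one was configured (e.g. via
 * set_mempolicy_home_node()).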
2088 */ 2089 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2090 pgoff_t ilx, int *nid) 2091 { 2092 nodemask_t *nodemask = NULL; 2093 2094 switch (pol->mode) { 2095 case MPOL_PREFERRED: 2096 /* Override input node id */ 2097 *nid = first_node(pol->nodes); 2098 break; 2099 case MPOL_PREFERRED_MANY: 2100 nodemask = &pol->nodes; 2101 if (pol->home_node != NUMA_NO_NODE) 2102 *nid = pol->home_node; 2103 break; 2104 case MPOL_BIND: 2105 /* Restrict to nodemask (but not on lower zones) */ 2106 if (apply_policy_zone(pol, gfp_zone(gfp)) && 2107 cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2108 nodemask = &pol->nodes; 2109 if (pol->home_node != NUMA_NO_NODE) 2110 *nid = pol->home_node; 2111 /* 2112 * __GFP_THISNODE shouldn't even be used with the bind policy 2113 * because we might easily break the expectation to stay on the 2114 * requested node and not break the policy. 2115 */ 2116 WARN_ON_ONCE(gfp & __GFP_THISNODE); 2117 break; 2118 case MPOL_INTERLEAVE: 2119 /* Override input node id */ 2120 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2121 interleave_nodes(pol) : interleave_nid(pol, ilx); 2122 break; 2123 case MPOL_WEIGHTED_INTERLEAVE: 2124 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2125 weighted_interleave_nodes(pol) : 2126 weighted_interleave_nid(pol, ilx); 2127 break; 2128 } 2129 2130 return nodemask; 2131 } 2132 2133 #ifdef CONFIG_HUGETLBFS 2134 /* 2135 * huge_node(@vma, @addr, @gfp_flags, @mpol) 2136 * @vma: virtual memory area whose policy is sought 2137 * @addr: address in @vma for shared policy lookup and interleave policy 2138 * @gfp_flags: for requested zone 2139 * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2140 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2141 * 2142 * Returns a nid suitable for a huge page allocation and a pointer 2143 * to the struct mempolicy for conditional unref after allocation. 2144 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2145 * to the mempolicy's @nodemask for filtering the zonelist. 2146 */ 2147 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 2148 struct mempolicy **mpol, nodemask_t **nodemask) 2149 { 2150 pgoff_t ilx; 2151 int nid; 2152 2153 nid = numa_node_id(); 2154 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2155 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 2156 return nid; 2157 } 2158 2159 /* 2160 * init_nodemask_of_mempolicy 2161 * 2162 * If the current task's mempolicy is "default" [NULL], return 'false' 2163 * to indicate default policy. Otherwise, extract the policy nodemask 2164 * for 'bind' or 'interleave' policy into the argument nodemask, or 2165 * initialize the argument nodemask to contain the single node for 2166 * 'preferred' or 'local' policy and return 'true' to indicate presence 2167 * of non-default mempolicy. 2168 * 2169 * We don't bother with reference counting the mempolicy [mpol_get/put] 2170 * because the current task is examining it's own mempolicy and a task's 2171 * mempolicy is only ever changed by the task itself. 2172 * 2173 * N.B., it is the caller's responsibility to free a returned nodemask. 
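 *
 * For example, a task interleaving over nodes 0-3 gets a mask of 0-3 back,
 * while a task with MPOL_LOCAL gets a mask holding only the local node.
 * One hugetlb user of this is sizing the persistent huge page pool against
 * the task's mempolicy (nr_hugepages_mempolicy).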
2174 */ 2175 bool init_nodemask_of_mempolicy(nodemask_t *mask) 2176 { 2177 struct mempolicy *mempolicy; 2178 2179 if (!(mask && current->mempolicy)) 2180 return false; 2181 2182 task_lock(current); 2183 mempolicy = current->mempolicy; 2184 switch (mempolicy->mode) { 2185 case MPOL_PREFERRED: 2186 case MPOL_PREFERRED_MANY: 2187 case MPOL_BIND: 2188 case MPOL_INTERLEAVE: 2189 case MPOL_WEIGHTED_INTERLEAVE: 2190 *mask = mempolicy->nodes; 2191 break; 2192 2193 case MPOL_LOCAL: 2194 init_nodemask_of_node(mask, numa_node_id()); 2195 break; 2196 2197 default: 2198 BUG(); 2199 } 2200 task_unlock(current); 2201 2202 return true; 2203 } 2204 #endif 2205 2206 /* 2207 * mempolicy_in_oom_domain 2208 * 2209 * If tsk's mempolicy is "bind", check for intersection between mask and 2210 * the policy nodemask. Otherwise, return true for all other policies 2211 * including "interleave", as a tsk with "interleave" policy may have 2212 * memory allocated from all nodes in system. 2213 * 2214 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 2215 */ 2216 bool mempolicy_in_oom_domain(struct task_struct *tsk, 2217 const nodemask_t *mask) 2218 { 2219 struct mempolicy *mempolicy; 2220 bool ret = true; 2221 2222 if (!mask) 2223 return ret; 2224 2225 task_lock(tsk); 2226 mempolicy = tsk->mempolicy; 2227 if (mempolicy && mempolicy->mode == MPOL_BIND) 2228 ret = nodes_intersects(mempolicy->nodes, *mask); 2229 task_unlock(tsk); 2230 2231 return ret; 2232 } 2233 2234 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2235 int nid, nodemask_t *nodemask) 2236 { 2237 struct page *page; 2238 gfp_t preferred_gfp; 2239 2240 /* 2241 * This is a two pass approach. The first pass will only try the 2242 * preferred nodes but skip the direct reclaim and allow the 2243 * allocation to fail, while the second pass will try all the 2244 * nodes in system. 2245 */ 2246 preferred_gfp = gfp | __GFP_NOWARN; 2247 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2248 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); 2249 if (!page) 2250 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); 2251 2252 return page; 2253 } 2254 2255 /** 2256 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2257 * @gfp: GFP flags. 2258 * @order: Order of the page allocation. 2259 * @pol: Pointer to the NUMA mempolicy. 2260 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2261 * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2262 * 2263 * Return: The page on success or NULL if allocation fails. 2264 */ 2265 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2266 struct mempolicy *pol, pgoff_t ilx, int nid) 2267 { 2268 nodemask_t *nodemask; 2269 struct page *page; 2270 2271 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2272 2273 if (pol->mode == MPOL_PREFERRED_MANY) 2274 return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2275 2276 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2277 /* filter "hugepage" allocation, unless from alloc_pages() */ 2278 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2279 /* 2280 * For hugepage allocation and non-interleave policy which 2281 * allows the current node (or other explicitly preferred 2282 * node) we only try to allocate from the current/preferred 2283 * node and don't fall back to other nodes, as the cost of 2284 * remote accesses would likely offset THP benefits. 
2285 * 2286 * If the policy is interleave or does not allow the current 2287 * node in its nodemask, we allocate the standard way. 2288 */ 2289 if (pol->mode != MPOL_INTERLEAVE && 2290 pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2291 (!nodemask || node_isset(nid, *nodemask))) { 2292 /* 2293 * First, try to allocate THP only on local node, but 2294 * don't reclaim unnecessarily, just compact. 2295 */ 2296 page = __alloc_frozen_pages_noprof( 2297 gfp | __GFP_THISNODE | __GFP_NORETRY, order, 2298 nid, NULL); 2299 if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2300 return page; 2301 /* 2302 * If hugepage allocations are configured to always 2303 * synchronous compact or the vma has been madvised 2304 * to prefer hugepage backing, retry allowing remote 2305 * memory with both reclaim and compact as well. 2306 */ 2307 } 2308 } 2309 2310 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); 2311 2312 if (unlikely(pol->mode == MPOL_INTERLEAVE || 2313 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { 2314 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2315 if (static_branch_likely(&vm_numa_stat_key) && 2316 page_to_nid(page) == nid) { 2317 preempt_disable(); 2318 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2319 preempt_enable(); 2320 } 2321 } 2322 2323 return page; 2324 } 2325 2326 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, 2327 struct mempolicy *pol, pgoff_t ilx, int nid) 2328 { 2329 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, 2330 ilx, nid); 2331 if (!page) 2332 return NULL; 2333 2334 set_page_refcounted(page); 2335 return page_rmappable_folio(page); 2336 } 2337 2338 /** 2339 * vma_alloc_folio - Allocate a folio for a VMA. 2340 * @gfp: GFP flags. 2341 * @order: Order of the folio. 2342 * @vma: Pointer to VMA. 2343 * @addr: Virtual address of the allocation. Must be inside @vma. 2344 * 2345 * Allocate a folio for a specific address in @vma, using the appropriate 2346 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2347 * VMA to prevent it from going away. Should be used for all allocations 2348 * for folios that will be mapped into user space, excepting hugetlbfs, and 2349 * excepting where direct use of folio_alloc_mpol() is more appropriate. 2350 * 2351 * Return: The folio on success or NULL if allocation fails. 2352 */ 2353 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, 2354 unsigned long addr) 2355 { 2356 struct mempolicy *pol; 2357 pgoff_t ilx; 2358 struct folio *folio; 2359 2360 if (vma->vm_flags & VM_DROPPABLE) 2361 gfp |= __GFP_NOWARN; 2362 2363 pol = get_vma_policy(vma, addr, order, &ilx); 2364 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2365 mpol_cond_put(pol); 2366 return folio; 2367 } 2368 EXPORT_SYMBOL(vma_alloc_folio_noprof); 2369 2370 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) 2371 { 2372 struct mempolicy *pol = &default_policy; 2373 2374 /* 2375 * No reference counting needed for current->mempolicy 2376 * nor system default_policy 2377 */ 2378 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2379 pol = get_task_policy(current); 2380 2381 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, 2382 numa_node_id()); 2383 } 2384 2385 /** 2386 * alloc_pages - Allocate pages. 2387 * @gfp: GFP flags. 2388 * @order: Power of two of number of pages to allocate. 2389 * 2390 * Allocate 1 << @order contiguous pages. 
The physical address of the 2391 * first page is naturally aligned (eg an order-3 allocation will be aligned 2392 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 2393 * process is honoured when in process context. 2394 * 2395 * Context: Can be called from any context, providing the appropriate GFP 2396 * flags are used. 2397 * Return: The page on success or NULL if allocation fails. 2398 */ 2399 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) 2400 { 2401 struct page *page = alloc_frozen_pages_noprof(gfp, order); 2402 2403 if (page) 2404 set_page_refcounted(page); 2405 return page; 2406 } 2407 EXPORT_SYMBOL(alloc_pages_noprof); 2408 2409 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) 2410 { 2411 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); 2412 } 2413 EXPORT_SYMBOL(folio_alloc_noprof); 2414 2415 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, 2416 struct mempolicy *pol, unsigned long nr_pages, 2417 struct page **page_array) 2418 { 2419 int nodes; 2420 unsigned long nr_pages_per_node; 2421 int delta; 2422 int i; 2423 unsigned long nr_allocated; 2424 unsigned long total_allocated = 0; 2425 2426 nodes = nodes_weight(pol->nodes); 2427 nr_pages_per_node = nr_pages / nodes; 2428 delta = nr_pages - nodes * nr_pages_per_node; 2429 2430 for (i = 0; i < nodes; i++) { 2431 if (delta) { 2432 nr_allocated = alloc_pages_bulk_noprof(gfp, 2433 interleave_nodes(pol), NULL, 2434 nr_pages_per_node + 1, 2435 page_array); 2436 delta--; 2437 } else { 2438 nr_allocated = alloc_pages_bulk_noprof(gfp, 2439 interleave_nodes(pol), NULL, 2440 nr_pages_per_node, page_array); 2441 } 2442 2443 page_array += nr_allocated; 2444 total_allocated += nr_allocated; 2445 } 2446 2447 return total_allocated; 2448 } 2449 2450 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, 2451 struct mempolicy *pol, unsigned long nr_pages, 2452 struct page **page_array) 2453 { 2454 struct task_struct *me = current; 2455 unsigned int cpuset_mems_cookie; 2456 unsigned long total_allocated = 0; 2457 unsigned long nr_allocated = 0; 2458 unsigned long rounds; 2459 unsigned long node_pages, delta; 2460 u8 *table, *weights, weight; 2461 unsigned int weight_total = 0; 2462 unsigned long rem_pages = nr_pages; 2463 nodemask_t nodes; 2464 int nnodes, node; 2465 int resume_node = MAX_NUMNODES - 1; 2466 u8 resume_weight = 0; 2467 int prev_node; 2468 int i; 2469 2470 if (!nr_pages) 2471 return 0; 2472 2473 /* read the nodes onto the stack, retry if done during rebind */ 2474 do { 2475 cpuset_mems_cookie = read_mems_allowed_begin(); 2476 nnodes = read_once_policy_nodemask(pol, &nodes); 2477 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2478 2479 /* if the nodemask has become invalid, we cannot do anything */ 2480 if (!nnodes) 2481 return 0; 2482 2483 /* Continue allocating from most recent node and adjust the nr_pages */ 2484 node = me->il_prev; 2485 weight = me->il_weight; 2486 if (weight && node_isset(node, nodes)) { 2487 node_pages = min(rem_pages, weight); 2488 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2489 page_array); 2490 page_array += nr_allocated; 2491 total_allocated += nr_allocated; 2492 /* if that's all the pages, no need to interleave */ 2493 if (rem_pages <= weight) { 2494 me->il_weight -= rem_pages; 2495 return total_allocated; 2496 } 2497 /* Otherwise we adjust remaining pages, continue from there */ 2498 rem_pages -= weight; 2499 } 2500 /* clear active weight in case of an allocation failure */ 2501 
me->il_weight = 0; 2502 prev_node = node; 2503 2504 /* create a local copy of node weights to operate on outside rcu */ 2505 weights = kzalloc(nr_node_ids, GFP_KERNEL); 2506 if (!weights) 2507 return total_allocated; 2508 2509 rcu_read_lock(); 2510 table = rcu_dereference(iw_table); 2511 if (table) 2512 memcpy(weights, table, nr_node_ids); 2513 rcu_read_unlock(); 2514 2515 /* calculate total, detect system default usage */ 2516 for_each_node_mask(node, nodes) { 2517 if (!weights[node]) 2518 weights[node] = 1; 2519 weight_total += weights[node]; 2520 } 2521 2522 /* 2523 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2524 * Track which node weighted interleave should resume from. 2525 * 2526 * If (rounds > 0) and (delta == 0), resume_node will always be 2527 * the node following prev_node, and resume_weight that node's full weight. 2528 */ 2529 rounds = rem_pages / weight_total; 2530 delta = rem_pages % weight_total; 2531 resume_node = next_node_in(prev_node, nodes); 2532 resume_weight = weights[resume_node]; 2533 for (i = 0; i < nnodes; i++) { 2534 node = next_node_in(prev_node, nodes); 2535 weight = weights[node]; 2536 node_pages = weight * rounds; 2537 /* If a delta exists, add this node's portion of the delta */ 2538 if (delta > weight) { 2539 node_pages += weight; 2540 delta -= weight; 2541 } else if (delta) { 2542 /* when delta is depleted, resume from that node */ 2543 node_pages += delta; 2544 resume_node = node; 2545 resume_weight = weight - delta; 2546 delta = 0; 2547 } 2548 /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2549 if (!node_pages) 2550 break; 2551 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2552 page_array); 2553 page_array += nr_allocated; 2554 total_allocated += nr_allocated; 2555 if (total_allocated == nr_pages) 2556 break; 2557 prev_node = node; 2558 } 2559 me->il_prev = resume_node; 2560 me->il_weight = resume_weight; 2561 kfree(weights); 2562 return total_allocated; 2563 } 2564 2565 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, 2566 struct mempolicy *pol, unsigned long nr_pages, 2567 struct page **page_array) 2568 { 2569 gfp_t preferred_gfp; 2570 unsigned long nr_allocated = 0; 2571 2572 preferred_gfp = gfp | __GFP_NOWARN; 2573 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2574 2575 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, 2576 nr_pages, page_array); 2577 2578 if (nr_allocated < nr_pages) 2579 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, 2580 nr_pages - nr_allocated, 2581 page_array + nr_allocated); 2582 return nr_allocated; 2583 } 2584 2585 /* Bulk page allocation and the mempolicy need to be considered at the 2586 * same time in some situations, such as vmalloc. 2587 * 2588 * This can speed up memory allocation considerably, especially for 2589 * interleaved allocations.
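 *
 * Worked example (illustrative): with plain MPOL_INTERLEAVE over three
 * nodes and nr_pages = 10, alloc_pages_bulk_interleave() above gives each
 * node 10 / 3 = 3 pages and hands the remaining delta = 1 page to the
 * first node, i.e. a 4/3/3 split.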
2590 */ 2591 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, 2592 unsigned long nr_pages, struct page **page_array) 2593 { 2594 struct mempolicy *pol = &default_policy; 2595 nodemask_t *nodemask; 2596 int nid; 2597 2598 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2599 pol = get_task_policy(current); 2600 2601 if (pol->mode == MPOL_INTERLEAVE) 2602 return alloc_pages_bulk_interleave(gfp, pol, 2603 nr_pages, page_array); 2604 2605 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2606 return alloc_pages_bulk_weighted_interleave( 2607 gfp, pol, nr_pages, page_array); 2608 2609 if (pol->mode == MPOL_PREFERRED_MANY) 2610 return alloc_pages_bulk_preferred_many(gfp, 2611 numa_node_id(), pol, nr_pages, page_array); 2612 2613 nid = numa_node_id(); 2614 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2615 return alloc_pages_bulk_noprof(gfp, nid, nodemask, 2616 nr_pages, page_array); 2617 } 2618 2619 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2620 { 2621 struct mempolicy *pol = mpol_dup(src->vm_policy); 2622 2623 if (IS_ERR(pol)) 2624 return PTR_ERR(pol); 2625 dst->vm_policy = pol; 2626 return 0; 2627 } 2628 2629 /* 2630 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2631 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2632 * with the mems_allowed returned by cpuset_mems_allowed(). This 2633 * keeps mempolicies cpuset relative after its cpuset moves. See 2634 * further kernel/cpuset.c update_nodemask(). 2635 * 2636 * current's mempolicy may be rebinded by the other task(the task that changes 2637 * cpuset's mems), so we needn't do rebind work for current task. 2638 */ 2639 2640 /* Slow path of a mempolicy duplicate */ 2641 struct mempolicy *__mpol_dup(struct mempolicy *old) 2642 { 2643 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2644 2645 if (!new) 2646 return ERR_PTR(-ENOMEM); 2647 2648 /* task's mempolicy is protected by alloc_lock */ 2649 if (old == current->mempolicy) { 2650 task_lock(current); 2651 *new = *old; 2652 task_unlock(current); 2653 } else 2654 *new = *old; 2655 2656 if (current_cpuset_is_being_rebound()) { 2657 nodemask_t mems = cpuset_mems_allowed(current); 2658 mpol_rebind_policy(new, &mems); 2659 } 2660 atomic_set(&new->refcnt, 1); 2661 return new; 2662 } 2663 2664 /* Slow path of a mempolicy comparison */ 2665 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2666 { 2667 if (!a || !b) 2668 return false; 2669 if (a->mode != b->mode) 2670 return false; 2671 if (a->flags != b->flags) 2672 return false; 2673 if (a->home_node != b->home_node) 2674 return false; 2675 if (mpol_store_user_nodemask(a)) 2676 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2677 return false; 2678 2679 switch (a->mode) { 2680 case MPOL_BIND: 2681 case MPOL_INTERLEAVE: 2682 case MPOL_PREFERRED: 2683 case MPOL_PREFERRED_MANY: 2684 case MPOL_WEIGHTED_INTERLEAVE: 2685 return !!nodes_equal(a->nodes, b->nodes); 2686 case MPOL_LOCAL: 2687 return true; 2688 default: 2689 BUG(); 2690 return false; 2691 } 2692 } 2693 2694 /* 2695 * Shared memory backing store policy support. 2696 * 2697 * Remember policies even when nobody has shared memory mapped. 2698 * The policies are kept in Red-Black tree linked from the inode. 2699 * They are protected by the sp->lock rwlock, which should be held 2700 * for any accesses to the tree. 2701 */ 2702 2703 /* 2704 * lookup first element intersecting start-end. 
Caller holds sp->lock for 2705 * reading or for writing 2706 */ 2707 static struct sp_node *sp_lookup(struct shared_policy *sp, 2708 pgoff_t start, pgoff_t end) 2709 { 2710 struct rb_node *n = sp->root.rb_node; 2711 2712 while (n) { 2713 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2714 2715 if (start >= p->end) 2716 n = n->rb_right; 2717 else if (end <= p->start) 2718 n = n->rb_left; 2719 else 2720 break; 2721 } 2722 if (!n) 2723 return NULL; 2724 for (;;) { 2725 struct sp_node *w = NULL; 2726 struct rb_node *prev = rb_prev(n); 2727 if (!prev) 2728 break; 2729 w = rb_entry(prev, struct sp_node, nd); 2730 if (w->end <= start) 2731 break; 2732 n = prev; 2733 } 2734 return rb_entry(n, struct sp_node, nd); 2735 } 2736 2737 /* 2738 * Insert a new shared policy into the list. Caller holds sp->lock for 2739 * writing. 2740 */ 2741 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2742 { 2743 struct rb_node **p = &sp->root.rb_node; 2744 struct rb_node *parent = NULL; 2745 struct sp_node *nd; 2746 2747 while (*p) { 2748 parent = *p; 2749 nd = rb_entry(parent, struct sp_node, nd); 2750 if (new->start < nd->start) 2751 p = &(*p)->rb_left; 2752 else if (new->end > nd->end) 2753 p = &(*p)->rb_right; 2754 else 2755 BUG(); 2756 } 2757 rb_link_node(&new->nd, parent, p); 2758 rb_insert_color(&new->nd, &sp->root); 2759 } 2760 2761 /* Find shared policy intersecting idx */ 2762 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 2763 pgoff_t idx) 2764 { 2765 struct mempolicy *pol = NULL; 2766 struct sp_node *sn; 2767 2768 if (!sp->root.rb_node) 2769 return NULL; 2770 read_lock(&sp->lock); 2771 sn = sp_lookup(sp, idx, idx+1); 2772 if (sn) { 2773 mpol_get(sn->policy); 2774 pol = sn->policy; 2775 } 2776 read_unlock(&sp->lock); 2777 return pol; 2778 } 2779 2780 static void sp_free(struct sp_node *n) 2781 { 2782 mpol_put(n->policy); 2783 kmem_cache_free(sn_cache, n); 2784 } 2785 2786 /** 2787 * mpol_misplaced - check whether current folio node is valid in policy 2788 * 2789 * @folio: folio to be checked 2790 * @vmf: structure describing the fault 2791 * @addr: virtual address in @vma for shared policy lookup and interleave policy 2792 * 2793 * Lookup current policy node id for vma,addr and "compare to" folio's 2794 * node id. Policy determination "mimics" alloc_page_vma(). 2795 * Called from fault path where we know the vma and faulting address. 2796 * 2797 * Return: NUMA_NO_NODE if the page is in a node that is valid for this 2798 * policy, or a suitable node ID to allocate a replacement folio from. 
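 *
 * Illustrative example: with MPOL_BIND over nodes 0-1 and a folio that
 * currently sits on node 2, a node from 0-1 is returned so the NUMA
 * hinting fault can migrate the folio; if MPOL_F_MORON is set, the node of
 * the CPU taking the fault is preferred instead, subject to
 * should_numa_migrate_memory().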
2799 */ 2800 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, 2801 unsigned long addr) 2802 { 2803 struct mempolicy *pol; 2804 pgoff_t ilx; 2805 struct zoneref *z; 2806 int curnid = folio_nid(folio); 2807 struct vm_area_struct *vma = vmf->vma; 2808 int thiscpu = raw_smp_processor_id(); 2809 int thisnid = numa_node_id(); 2810 int polnid = NUMA_NO_NODE; 2811 int ret = NUMA_NO_NODE; 2812 2813 /* 2814 * Make sure ptl is held so that we don't preempt and we 2815 * have a stable smp processor id 2816 */ 2817 lockdep_assert_held(vmf->ptl); 2818 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2819 if (!(pol->flags & MPOL_F_MOF)) 2820 goto out; 2821 2822 switch (pol->mode) { 2823 case MPOL_INTERLEAVE: 2824 polnid = interleave_nid(pol, ilx); 2825 break; 2826 2827 case MPOL_WEIGHTED_INTERLEAVE: 2828 polnid = weighted_interleave_nid(pol, ilx); 2829 break; 2830 2831 case MPOL_PREFERRED: 2832 if (node_isset(curnid, pol->nodes)) 2833 goto out; 2834 polnid = first_node(pol->nodes); 2835 break; 2836 2837 case MPOL_LOCAL: 2838 polnid = numa_node_id(); 2839 break; 2840 2841 case MPOL_BIND: 2842 case MPOL_PREFERRED_MANY: 2843 /* 2844 * Even though MPOL_PREFERRED_MANY can allocate pages outside 2845 * policy nodemask we don't allow numa migration to nodes 2846 * outside policy nodemask for now. This is done so that if we 2847 * want demotion to slow memory to happen, before allocating 2848 * from some DRAM node say 'x', we will end up using a 2849 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario 2850 * we should not promote to node 'x' from slow memory node. 2851 */ 2852 if (pol->flags & MPOL_F_MORON) { 2853 /* 2854 * Optimize placement among multiple nodes 2855 * via NUMA balancing 2856 */ 2857 if (node_isset(thisnid, pol->nodes)) 2858 break; 2859 goto out; 2860 } 2861 2862 /* 2863 * use current page if in policy nodemask, 2864 * else select nearest allowed node, if any. 2865 * If no allowed nodes, use current [!misplaced]. 2866 */ 2867 if (node_isset(curnid, pol->nodes)) 2868 goto out; 2869 z = first_zones_zonelist( 2870 node_zonelist(thisnid, GFP_HIGHUSER), 2871 gfp_zone(GFP_HIGHUSER), 2872 &pol->nodes); 2873 polnid = zonelist_node_idx(z); 2874 break; 2875 2876 default: 2877 BUG(); 2878 } 2879 2880 /* Migrate the folio towards the node whose CPU is referencing it */ 2881 if (pol->flags & MPOL_F_MORON) { 2882 polnid = thisnid; 2883 2884 if (!should_numa_migrate_memory(current, folio, curnid, 2885 thiscpu)) 2886 goto out; 2887 } 2888 2889 if (curnid != polnid) 2890 ret = polnid; 2891 out: 2892 mpol_cond_put(pol); 2893 2894 return ret; 2895 } 2896 2897 /* 2898 * Drop the (possibly final) reference to task->mempolicy. It needs to be 2899 * dropped after task->mempolicy is set to NULL so that any allocation done as 2900 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 2901 * policy. 
2902 */ 2903 void mpol_put_task_policy(struct task_struct *task) 2904 { 2905 struct mempolicy *pol; 2906 2907 task_lock(task); 2908 pol = task->mempolicy; 2909 task->mempolicy = NULL; 2910 task_unlock(task); 2911 mpol_put(pol); 2912 } 2913 2914 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2915 { 2916 rb_erase(&n->nd, &sp->root); 2917 sp_free(n); 2918 } 2919 2920 static void sp_node_init(struct sp_node *node, unsigned long start, 2921 unsigned long end, struct mempolicy *pol) 2922 { 2923 node->start = start; 2924 node->end = end; 2925 node->policy = pol; 2926 } 2927 2928 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2929 struct mempolicy *pol) 2930 { 2931 struct sp_node *n; 2932 struct mempolicy *newpol; 2933 2934 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2935 if (!n) 2936 return NULL; 2937 2938 newpol = mpol_dup(pol); 2939 if (IS_ERR(newpol)) { 2940 kmem_cache_free(sn_cache, n); 2941 return NULL; 2942 } 2943 newpol->flags |= MPOL_F_SHARED; 2944 sp_node_init(n, start, end, newpol); 2945 2946 return n; 2947 } 2948 2949 /* Replace a policy range. */ 2950 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 2951 pgoff_t end, struct sp_node *new) 2952 { 2953 struct sp_node *n; 2954 struct sp_node *n_new = NULL; 2955 struct mempolicy *mpol_new = NULL; 2956 int ret = 0; 2957 2958 restart: 2959 write_lock(&sp->lock); 2960 n = sp_lookup(sp, start, end); 2961 /* Take care of old policies in the same range. */ 2962 while (n && n->start < end) { 2963 struct rb_node *next = rb_next(&n->nd); 2964 if (n->start >= start) { 2965 if (n->end <= end) 2966 sp_delete(sp, n); 2967 else 2968 n->start = end; 2969 } else { 2970 /* Old policy spanning whole new range. */ 2971 if (n->end > end) { 2972 if (!n_new) 2973 goto alloc_new; 2974 2975 *mpol_new = *n->policy; 2976 atomic_set(&mpol_new->refcnt, 1); 2977 sp_node_init(n_new, end, n->end, mpol_new); 2978 n->end = start; 2979 sp_insert(sp, n_new); 2980 n_new = NULL; 2981 mpol_new = NULL; 2982 break; 2983 } else 2984 n->end = start; 2985 } 2986 if (!next) 2987 break; 2988 n = rb_entry(next, struct sp_node, nd); 2989 } 2990 if (new) 2991 sp_insert(sp, new); 2992 write_unlock(&sp->lock); 2993 ret = 0; 2994 2995 err_out: 2996 if (mpol_new) 2997 mpol_put(mpol_new); 2998 if (n_new) 2999 kmem_cache_free(sn_cache, n_new); 3000 3001 return ret; 3002 3003 alloc_new: 3004 write_unlock(&sp->lock); 3005 ret = -ENOMEM; 3006 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3007 if (!n_new) 3008 goto err_out; 3009 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3010 if (!mpol_new) 3011 goto err_out; 3012 atomic_set(&mpol_new->refcnt, 1); 3013 goto restart; 3014 } 3015 3016 /** 3017 * mpol_shared_policy_init - initialize shared policy for inode 3018 * @sp: pointer to inode shared policy 3019 * @mpol: struct mempolicy to install 3020 * 3021 * Install non-NULL @mpol in inode's shared policy rb-tree. 3022 * On entry, the current task has a reference on a non-NULL @mpol. 3023 * This must be released on exit. 3024 * This is called at get_inode() calls and we can use GFP_KERNEL. 
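 *
 * For example, a tmpfs mount with "-o mpol=interleave:0-3" ends up here for
 * each new inode: the mount's mempolicy is contextualized and installed as
 * a single sp_node covering the whole file, so every mapping of that file
 * sees the interleave policy.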
3025 */ 3026 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3027 { 3028 int ret; 3029 3030 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3031 rwlock_init(&sp->lock); 3032 3033 if (mpol) { 3034 struct sp_node *sn; 3035 struct mempolicy *npol; 3036 NODEMASK_SCRATCH(scratch); 3037 3038 if (!scratch) 3039 goto put_mpol; 3040 3041 /* contextualize the tmpfs mount point mempolicy to this file */ 3042 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3043 if (IS_ERR(npol)) 3044 goto free_scratch; /* no valid nodemask intersection */ 3045 3046 task_lock(current); 3047 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3048 task_unlock(current); 3049 if (ret) 3050 goto put_npol; 3051 3052 /* alloc node covering entire file; adds ref to file's npol */ 3053 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3054 if (sn) 3055 sp_insert(sp, sn); 3056 put_npol: 3057 mpol_put(npol); /* drop initial ref on file's npol */ 3058 free_scratch: 3059 NODEMASK_SCRATCH_FREE(scratch); 3060 put_mpol: 3061 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3062 } 3063 } 3064 3065 int mpol_set_shared_policy(struct shared_policy *sp, 3066 struct vm_area_struct *vma, struct mempolicy *pol) 3067 { 3068 int err; 3069 struct sp_node *new = NULL; 3070 unsigned long sz = vma_pages(vma); 3071 3072 if (pol) { 3073 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3074 if (!new) 3075 return -ENOMEM; 3076 } 3077 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3078 if (err && new) 3079 sp_free(new); 3080 return err; 3081 } 3082 3083 /* Free a backing policy store on inode delete. */ 3084 void mpol_free_shared_policy(struct shared_policy *sp) 3085 { 3086 struct sp_node *n; 3087 struct rb_node *next; 3088 3089 if (!sp->root.rb_node) 3090 return; 3091 write_lock(&sp->lock); 3092 next = rb_first(&sp->root); 3093 while (next) { 3094 n = rb_entry(next, struct sp_node, nd); 3095 next = rb_next(&n->nd); 3096 sp_delete(sp, n); 3097 } 3098 write_unlock(&sp->lock); 3099 } 3100 3101 #ifdef CONFIG_NUMA_BALANCING 3102 static int __initdata numabalancing_override; 3103 3104 static void __init check_numabalancing_enable(void) 3105 { 3106 bool numabalancing_default = false; 3107 3108 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 3109 numabalancing_default = true; 3110 3111 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3112 if (numabalancing_override) 3113 set_numabalancing_state(numabalancing_override == 1); 3114 3115 if (num_online_nodes() > 1 && !numabalancing_override) { 3116 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3117 numabalancing_default ? 
"Enabling" : "Disabling"); 3118 set_numabalancing_state(numabalancing_default); 3119 } 3120 } 3121 3122 static int __init setup_numabalancing(char *str) 3123 { 3124 int ret = 0; 3125 if (!str) 3126 goto out; 3127 3128 if (!strcmp(str, "enable")) { 3129 numabalancing_override = 1; 3130 ret = 1; 3131 } else if (!strcmp(str, "disable")) { 3132 numabalancing_override = -1; 3133 ret = 1; 3134 } 3135 out: 3136 if (!ret) 3137 pr_warn("Unable to parse numa_balancing=\n"); 3138 3139 return ret; 3140 } 3141 __setup("numa_balancing=", setup_numabalancing); 3142 #else 3143 static inline void __init check_numabalancing_enable(void) 3144 { 3145 } 3146 #endif /* CONFIG_NUMA_BALANCING */ 3147 3148 void __init numa_policy_init(void) 3149 { 3150 nodemask_t interleave_nodes; 3151 unsigned long largest = 0; 3152 int nid, prefer = 0; 3153 3154 policy_cache = kmem_cache_create("numa_policy", 3155 sizeof(struct mempolicy), 3156 0, SLAB_PANIC, NULL); 3157 3158 sn_cache = kmem_cache_create("shared_policy_node", 3159 sizeof(struct sp_node), 3160 0, SLAB_PANIC, NULL); 3161 3162 for_each_node(nid) { 3163 preferred_node_policy[nid] = (struct mempolicy) { 3164 .refcnt = ATOMIC_INIT(1), 3165 .mode = MPOL_PREFERRED, 3166 .flags = MPOL_F_MOF | MPOL_F_MORON, 3167 .nodes = nodemask_of_node(nid), 3168 }; 3169 } 3170 3171 /* 3172 * Set interleaving policy for system init. Interleaving is only 3173 * enabled across suitably sized nodes (default is >= 16MB), or 3174 * fall back to the largest node if they're all smaller. 3175 */ 3176 nodes_clear(interleave_nodes); 3177 for_each_node_state(nid, N_MEMORY) { 3178 unsigned long total_pages = node_present_pages(nid); 3179 3180 /* Preserve the largest node */ 3181 if (largest < total_pages) { 3182 largest = total_pages; 3183 prefer = nid; 3184 } 3185 3186 /* Interleave this node? */ 3187 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3188 node_set(nid, interleave_nodes); 3189 } 3190 3191 /* All too small, use the largest */ 3192 if (unlikely(nodes_empty(interleave_nodes))) 3193 node_set(prefer, interleave_nodes); 3194 3195 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3196 pr_err("%s: interleaving failed\n", __func__); 3197 3198 check_numabalancing_enable(); 3199 } 3200 3201 /* Reset policy of current process to default */ 3202 void numa_default_policy(void) 3203 { 3204 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 3205 } 3206 3207 /* 3208 * Parse and format mempolicy from/to strings 3209 */ 3210 static const char * const policy_modes[] = 3211 { 3212 [MPOL_DEFAULT] = "default", 3213 [MPOL_PREFERRED] = "prefer", 3214 [MPOL_BIND] = "bind", 3215 [MPOL_INTERLEAVE] = "interleave", 3216 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3217 [MPOL_LOCAL] = "local", 3218 [MPOL_PREFERRED_MANY] = "prefer (many)", 3219 }; 3220 3221 #ifdef CONFIG_TMPFS 3222 /** 3223 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 3224 * @str: string containing mempolicy to parse 3225 * @mpol: pointer to struct mempolicy pointer, returned on success. 
3226 * 3227 * Format of input: 3228 * <mode>[=<flags>][:<nodelist>] 3229 * 3230 * Return: %0 on success, else %1 3231 */ 3232 int mpol_parse_str(char *str, struct mempolicy **mpol) 3233 { 3234 struct mempolicy *new = NULL; 3235 unsigned short mode_flags; 3236 nodemask_t nodes; 3237 char *nodelist = strchr(str, ':'); 3238 char *flags = strchr(str, '='); 3239 int err = 1, mode; 3240 3241 if (flags) 3242 *flags++ = '\0'; /* terminate mode string */ 3243 3244 if (nodelist) { 3245 /* NUL-terminate mode or flags string */ 3246 *nodelist++ = '\0'; 3247 if (nodelist_parse(nodelist, nodes)) 3248 goto out; 3249 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3250 goto out; 3251 } else 3252 nodes_clear(nodes); 3253 3254 mode = match_string(policy_modes, MPOL_MAX, str); 3255 if (mode < 0) 3256 goto out; 3257 3258 switch (mode) { 3259 case MPOL_PREFERRED: 3260 /* 3261 * Insist on a nodelist of one node only, although later 3262 * we use first_node(nodes) to grab a single node, so here 3263 * nodelist (or nodes) cannot be empty. 3264 */ 3265 if (nodelist) { 3266 char *rest = nodelist; 3267 while (isdigit(*rest)) 3268 rest++; 3269 if (*rest) 3270 goto out; 3271 if (nodes_empty(nodes)) 3272 goto out; 3273 } 3274 break; 3275 case MPOL_INTERLEAVE: 3276 case MPOL_WEIGHTED_INTERLEAVE: 3277 /* 3278 * Default to online nodes with memory if no nodelist 3279 */ 3280 if (!nodelist) 3281 nodes = node_states[N_MEMORY]; 3282 break; 3283 case MPOL_LOCAL: 3284 /* 3285 * Don't allow a nodelist; mpol_new() checks flags 3286 */ 3287 if (nodelist) 3288 goto out; 3289 break; 3290 case MPOL_DEFAULT: 3291 /* 3292 * Insist on a empty nodelist 3293 */ 3294 if (!nodelist) 3295 err = 0; 3296 goto out; 3297 case MPOL_PREFERRED_MANY: 3298 case MPOL_BIND: 3299 /* 3300 * Insist on a nodelist 3301 */ 3302 if (!nodelist) 3303 goto out; 3304 } 3305 3306 mode_flags = 0; 3307 if (flags) { 3308 /* 3309 * Currently, we only support two mutually exclusive 3310 * mode flags. 3311 */ 3312 if (!strcmp(flags, "static")) 3313 mode_flags |= MPOL_F_STATIC_NODES; 3314 else if (!strcmp(flags, "relative")) 3315 mode_flags |= MPOL_F_RELATIVE_NODES; 3316 else 3317 goto out; 3318 } 3319 3320 new = mpol_new(mode, mode_flags, &nodes); 3321 if (IS_ERR(new)) 3322 goto out; 3323 3324 /* 3325 * Save nodes for mpol_to_str() to show the tmpfs mount options 3326 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3327 */ 3328 if (mode != MPOL_PREFERRED) { 3329 new->nodes = nodes; 3330 } else if (nodelist) { 3331 nodes_clear(new->nodes); 3332 node_set(first_node(nodes), new->nodes); 3333 } else { 3334 new->mode = MPOL_LOCAL; 3335 } 3336 3337 /* 3338 * Save nodes for contextualization: this will be used to "clone" 3339 * the mempolicy in a specific context [cpuset] at a later time. 3340 */ 3341 new->w.user_nodemask = nodes; 3342 3343 err = 0; 3344 3345 out: 3346 /* Restore string for error message */ 3347 if (nodelist) 3348 *--nodelist = ':'; 3349 if (flags) 3350 *--flags = '='; 3351 if (!err) 3352 *mpol = new; 3353 return err; 3354 } 3355 #endif /* CONFIG_TMPFS */ 3356 3357 /** 3358 * mpol_to_str - format a mempolicy structure for printing 3359 * @buffer: to contain formatted mempolicy string 3360 * @maxlen: length of @buffer 3361 * @pol: pointer to mempolicy to be formatted 3362 * 3363 * Convert @pol into a string. If @buffer is too short, truncate the string. 3364 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3365 * interleave", plus the longest flag flags, "relative|balancing", and to 3366 * display at least a few node ids. 
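 *
 * Example outputs (derived from policy_modes[] and the flag handling
 * below): "interleave:0-3", "bind=static:0,2", "prefer (many):0-1",
 * "weighted interleave:0-3", or just "default" / "local" for the
 * nodemask-less modes.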
3367 */ 3368 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3369 { 3370 char *p = buffer; 3371 nodemask_t nodes = NODE_MASK_NONE; 3372 unsigned short mode = MPOL_DEFAULT; 3373 unsigned short flags = 0; 3374 3375 if (pol && 3376 pol != &default_policy && 3377 !(pol >= &preferred_node_policy[0] && 3378 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3379 mode = pol->mode; 3380 flags = pol->flags; 3381 } 3382 3383 switch (mode) { 3384 case MPOL_DEFAULT: 3385 case MPOL_LOCAL: 3386 break; 3387 case MPOL_PREFERRED: 3388 case MPOL_PREFERRED_MANY: 3389 case MPOL_BIND: 3390 case MPOL_INTERLEAVE: 3391 case MPOL_WEIGHTED_INTERLEAVE: 3392 nodes = pol->nodes; 3393 break; 3394 default: 3395 WARN_ON_ONCE(1); 3396 snprintf(p, maxlen, "unknown"); 3397 return; 3398 } 3399 3400 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3401 3402 if (flags & MPOL_MODE_FLAGS) { 3403 p += snprintf(p, buffer + maxlen - p, "="); 3404 3405 /* 3406 * Static and relative are mutually exclusive. 3407 */ 3408 if (flags & MPOL_F_STATIC_NODES) 3409 p += snprintf(p, buffer + maxlen - p, "static"); 3410 else if (flags & MPOL_F_RELATIVE_NODES) 3411 p += snprintf(p, buffer + maxlen - p, "relative"); 3412 3413 if (flags & MPOL_F_NUMA_BALANCING) { 3414 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3415 p += snprintf(p, buffer + maxlen - p, "|"); 3416 p += snprintf(p, buffer + maxlen - p, "balancing"); 3417 } 3418 } 3419 3420 if (!nodes_empty(nodes)) 3421 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3422 nodemask_pr_args(&nodes)); 3423 } 3424 3425 #ifdef CONFIG_SYSFS 3426 struct iw_node_attr { 3427 struct kobj_attribute kobj_attr; 3428 int nid; 3429 }; 3430 3431 struct sysfs_wi_group { 3432 struct kobject wi_kobj; 3433 struct mutex kobj_lock; 3434 struct iw_node_attr *nattrs[]; 3435 }; 3436 3437 static struct sysfs_wi_group *wi_group; 3438 3439 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3440 char *buf) 3441 { 3442 struct iw_node_attr *node_attr; 3443 u8 weight; 3444 3445 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3446 weight = get_il_weight(node_attr->nid); 3447 return sysfs_emit(buf, "%d\n", weight); 3448 } 3449 3450 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3451 const char *buf, size_t count) 3452 { 3453 struct iw_node_attr *node_attr; 3454 u8 *new; 3455 u8 *old; 3456 u8 weight = 0; 3457 3458 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3459 if (count == 0 || sysfs_streq(buf, "")) 3460 weight = 0; 3461 else if (kstrtou8(buf, 0, &weight)) 3462 return -EINVAL; 3463 3464 new = kzalloc(nr_node_ids, GFP_KERNEL); 3465 if (!new) 3466 return -ENOMEM; 3467 3468 mutex_lock(&iw_table_lock); 3469 old = rcu_dereference_protected(iw_table, 3470 lockdep_is_held(&iw_table_lock)); 3471 if (old) 3472 memcpy(new, old, nr_node_ids); 3473 new[node_attr->nid] = weight; 3474 rcu_assign_pointer(iw_table, new); 3475 mutex_unlock(&iw_table_lock); 3476 synchronize_rcu(); 3477 kfree(old); 3478 return count; 3479 } 3480 3481 static void sysfs_wi_node_delete(int nid) 3482 { 3483 struct iw_node_attr *attr; 3484 3485 if (nid < 0 || nid >= nr_node_ids) 3486 return; 3487 3488 mutex_lock(&wi_group->kobj_lock); 3489 attr = wi_group->nattrs[nid]; 3490 if (!attr) { 3491 mutex_unlock(&wi_group->kobj_lock); 3492 return; 3493 } 3494 3495 wi_group->nattrs[nid] = NULL; 3496 mutex_unlock(&wi_group->kobj_lock); 3497 3498 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); 3499 kfree(attr->kobj_attr.attr.name); 
3500 kfree(attr); 3501 } 3502 3503 static void sysfs_wi_node_delete_all(void) 3504 { 3505 int nid; 3506 3507 for (nid = 0; nid < nr_node_ids; nid++) 3508 sysfs_wi_node_delete(nid); 3509 } 3510 3511 static void iw_table_free(void) 3512 { 3513 u8 *old; 3514 3515 mutex_lock(&iw_table_lock); 3516 old = rcu_dereference_protected(iw_table, 3517 lockdep_is_held(&iw_table_lock)); 3518 rcu_assign_pointer(iw_table, NULL); 3519 mutex_unlock(&iw_table_lock); 3520 3521 synchronize_rcu(); 3522 kfree(old); 3523 } 3524 3525 static void wi_cleanup(void) { 3526 sysfs_wi_node_delete_all(); 3527 iw_table_free(); 3528 } 3529 3530 static void wi_kobj_release(struct kobject *wi_kobj) 3531 { 3532 kfree(wi_group); 3533 } 3534 3535 static const struct kobj_type wi_ktype = { 3536 .sysfs_ops = &kobj_sysfs_ops, 3537 .release = wi_kobj_release, 3538 }; 3539 3540 static int sysfs_wi_node_add(int nid) 3541 { 3542 int ret; 3543 char *name; 3544 struct iw_node_attr *new_attr; 3545 3546 if (nid < 0 || nid >= nr_node_ids) { 3547 pr_err("invalid node id: %d\n", nid); 3548 return -EINVAL; 3549 } 3550 3551 new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); 3552 if (!new_attr) 3553 return -ENOMEM; 3554 3555 name = kasprintf(GFP_KERNEL, "node%d", nid); 3556 if (!name) { 3557 kfree(new_attr); 3558 return -ENOMEM; 3559 } 3560 3561 sysfs_attr_init(&new_attr->kobj_attr.attr); 3562 new_attr->kobj_attr.attr.name = name; 3563 new_attr->kobj_attr.attr.mode = 0644; 3564 new_attr->kobj_attr.show = node_show; 3565 new_attr->kobj_attr.store = node_store; 3566 new_attr->nid = nid; 3567 3568 mutex_lock(&wi_group->kobj_lock); 3569 if (wi_group->nattrs[nid]) { 3570 mutex_unlock(&wi_group->kobj_lock); 3571 ret = -EEXIST; 3572 goto out; 3573 } 3574 3575 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); 3576 if (ret) { 3577 mutex_unlock(&wi_group->kobj_lock); 3578 goto out; 3579 } 3580 wi_group->nattrs[nid] = new_attr; 3581 mutex_unlock(&wi_group->kobj_lock); 3582 return 0; 3583 3584 out: 3585 kfree(new_attr->kobj_attr.attr.name); 3586 kfree(new_attr); 3587 return ret; 3588 } 3589 3590 static int wi_node_notifier(struct notifier_block *nb, 3591 unsigned long action, void *data) 3592 { 3593 int err; 3594 struct memory_notify *arg = data; 3595 int nid = arg->status_change_nid; 3596 3597 if (nid < 0) 3598 return NOTIFY_OK; 3599 3600 switch (action) { 3601 case MEM_ONLINE: 3602 err = sysfs_wi_node_add(nid); 3603 if (err) 3604 pr_err("failed to add sysfs for node%d during hotplug: %d\n", 3605 nid, err); 3606 break; 3607 case MEM_OFFLINE: 3608 sysfs_wi_node_delete(nid); 3609 break; 3610 } 3611 3612 return NOTIFY_OK; 3613 } 3614 3615 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) 3616 { 3617 int nid, err; 3618 3619 wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), 3620 GFP_KERNEL); 3621 if (!wi_group) 3622 return -ENOMEM; 3623 mutex_init(&wi_group->kobj_lock); 3624 3625 err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, 3626 "weighted_interleave"); 3627 if (err) 3628 goto err_put_kobj; 3629 3630 for_each_online_node(nid) { 3631 if (!node_state(nid, N_MEMORY)) 3632 continue; 3633 3634 err = sysfs_wi_node_add(nid); 3635 if (err) { 3636 pr_err("failed to add sysfs for node%d during init: %d\n", 3637 nid, err); 3638 goto err_cleanup_kobj; 3639 } 3640 } 3641 3642 hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); 3643 return 0; 3644 3645 err_cleanup_kobj: 3646 wi_cleanup(); 3647 kobject_del(&wi_group->wi_kobj); 3648 err_put_kobj: 3649 
kobject_put(&wi_group->wi_kobj); 3650 return err; 3651 } 3652 3653 static int __init mempolicy_sysfs_init(void) 3654 { 3655 int err; 3656 static struct kobject *mempolicy_kobj; 3657 3658 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); 3659 if (!mempolicy_kobj) 3660 return -ENOMEM; 3661 3662 err = add_weighted_interleave_group(mempolicy_kobj); 3663 if (err) 3664 goto err_kobj; 3665 3666 return 0; 3667 3668 err_kobj: 3669 kobject_del(mempolicy_kobj); 3670 kobject_put(mempolicy_kobj); 3671 return err; 3672 } 3673 3674 late_initcall(mempolicy_sysfs_init); 3675 #endif /* CONFIG_SYSFS */ 3676
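/*
 * Usage sketch (illustrative, not part of this file): configuring the sysfs
 * weights created above and then requesting weighted interleave from
 * userspace. MPOL_WEIGHTED_INTERLEAVE comes from the uapi
 * <linux/mempolicy.h>; older libnuma/glibc headers may not define it yet,
 * hence the fallback definition below. Error handling is omitted.
 *
 *	# echo 3 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	# echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 *	#include <numa.h>
 *	#include <numaif.h>
 *	#ifndef MPOL_WEIGHTED_INTERLEAVE
 *	#define MPOL_WEIGHTED_INTERLEAVE 6
 *	#endif
 *
 *	struct bitmask *nodes = numa_parse_nodestring("0-1");
 *	set_mempolicy(MPOL_WEIGHTED_INTERLEAVE, nodes->maskp, nodes->size + 1);
 *	... subsequent anonymous allocations are spread 3:1 across nodes 0-1 ...
 */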