// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails. Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger OOM much faster and the
   kernel is not always graceful about that.
*/
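/*
 * Illustrative userspace sketch (not part of this file, shown only to make
 * the modes above concrete): the process policy is installed with
 * set_mempolicy(2) and a per-VMA policy with mbind(2). This assumes the
 * <numaif.h> wrappers from libnuma and headers new enough to define the
 * newer modes; treat it as a sketch, not a reference.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	// Process policy: interleave future allocations over nodes 0 and 1.
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1);
 *
 *	// VMA policy: bind one mapping to node 0, migrating misplaced pages.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, 8 * sizeof(node0) + 1,
 *	      MPOL_MF_MOVE);
 *
 * (The "+ 1" allows for the off-by-one trim that get_nodes() applies to
 * maxnode further down in this file.)
 */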
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
 * that the system-default value should be used. A NULL iw_table also denotes
 * that system-default values should be used. Until the system-default table
 * is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected.
 */
static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
	u8 *table;
	u8 weight;

	rcu_read_lock();
	table = rcu_dereference(iw_table);
	/* if no iw_table, use system default */
	weight = table ? table[node] : 1;
	/* if value in iw_table is 0, use system default */
	weight = weight ? weight : 1;
	rcu_read_unlock();
	return weight;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
173 * 174 * Return: this @node if it is in state, otherwise the closest node by distance 175 */ 176 int numa_nearest_node(int node, unsigned int state) 177 { 178 int min_dist = INT_MAX, dist, n, min_node; 179 180 if (state >= NR_NODE_STATES) 181 return -EINVAL; 182 183 if (node == NUMA_NO_NODE || node_state(node, state)) 184 return node; 185 186 min_node = node; 187 for_each_node_state(n, state) { 188 dist = node_distance(node, n); 189 if (dist < min_dist) { 190 min_dist = dist; 191 min_node = n; 192 } 193 } 194 195 return min_node; 196 } 197 EXPORT_SYMBOL_GPL(numa_nearest_node); 198 199 /** 200 * nearest_node_nodemask - Find the node in @mask at the nearest distance 201 * from @node. 202 * 203 * @node: a valid node ID to start the search from. 204 * @mask: a pointer to a nodemask representing the allowed nodes. 205 * 206 * This function iterates over all nodes in @mask and calculates the 207 * distance from the starting @node, then it returns the node ID that is 208 * the closest to @node, or MAX_NUMNODES if no node is found. 209 * 210 * Note that @node must be a valid node ID usable with node_distance(), 211 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 212 * or unexpected behavior. 213 */ 214 int nearest_node_nodemask(int node, nodemask_t *mask) 215 { 216 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 217 218 for_each_node_mask(n, *mask) { 219 dist = node_distance(node, n); 220 if (dist < min_dist) { 221 min_dist = dist; 222 min_node = n; 223 } 224 } 225 226 return min_node; 227 } 228 EXPORT_SYMBOL_GPL(nearest_node_nodemask); 229 230 struct mempolicy *get_task_policy(struct task_struct *p) 231 { 232 struct mempolicy *pol = p->mempolicy; 233 int node; 234 235 if (pol) 236 return pol; 237 238 node = numa_node_id(); 239 if (node != NUMA_NO_NODE) { 240 pol = &preferred_node_policy[node]; 241 /* preferred_node_policy is not initialised early in boot */ 242 if (pol->mode) 243 return pol; 244 } 245 246 return &default_policy; 247 } 248 249 static const struct mempolicy_operations { 250 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 251 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 252 } mpol_ops[MPOL_MAX]; 253 254 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 255 { 256 return pol->flags & MPOL_MODE_FLAGS; 257 } 258 259 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 260 const nodemask_t *rel) 261 { 262 nodemask_t tmp; 263 nodes_fold(tmp, *orig, nodes_weight(*rel)); 264 nodes_onto(*ret, tmp, *rel); 265 } 266 267 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 268 { 269 if (nodes_empty(*nodes)) 270 return -EINVAL; 271 pol->nodes = *nodes; 272 return 0; 273 } 274 275 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 276 { 277 if (nodes_empty(*nodes)) 278 return -EINVAL; 279 280 nodes_clear(pol->nodes); 281 node_set(first_node(*nodes), pol->nodes); 282 return 0; 283 } 284 285 /* 286 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 287 * any, for the new policy. mpol_new() has already validated the nodes 288 * parameter with respect to the policy mode and flags. 289 * 290 * Must be called holding task's alloc_lock to protect task's mems_allowed 291 * and mempolicy. May also be called holding the mmap_lock for write. 
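 *
 * As a worked example of the remap done by mpol_relative_nodemask() above
 * (a sketch with made-up node numbers): for MPOL_F_RELATIVE_NODES with a
 * user nodemask of {0,2} and cpuset mems_allowed of {4,5,6,7}, nodes_fold()
 * folds the user nodes modulo nodes_weight(mems_allowed) == 4, leaving
 * {0,2}, and nodes_onto() then maps them onto the 0th and 2nd allowed
 * nodes, yielding {4,6}. With MPOL_F_STATIC_NODES the user nodemask is
 * instead intersected with mems_allowed as-is.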
292 */ 293 static int mpol_set_nodemask(struct mempolicy *pol, 294 const nodemask_t *nodes, struct nodemask_scratch *nsc) 295 { 296 int ret; 297 298 /* 299 * Default (pol==NULL) resp. local memory policies are not a 300 * subject of any remapping. They also do not need any special 301 * constructor. 302 */ 303 if (!pol || pol->mode == MPOL_LOCAL) 304 return 0; 305 306 /* Check N_MEMORY */ 307 nodes_and(nsc->mask1, 308 cpuset_current_mems_allowed, node_states[N_MEMORY]); 309 310 VM_BUG_ON(!nodes); 311 312 if (pol->flags & MPOL_F_RELATIVE_NODES) 313 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 314 else 315 nodes_and(nsc->mask2, *nodes, nsc->mask1); 316 317 if (mpol_store_user_nodemask(pol)) 318 pol->w.user_nodemask = *nodes; 319 else 320 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 321 322 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 323 return ret; 324 } 325 326 /* 327 * This function just creates a new policy, does some check and simple 328 * initialization. You must invoke mpol_set_nodemask() to set nodes. 329 */ 330 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 331 nodemask_t *nodes) 332 { 333 struct mempolicy *policy; 334 335 if (mode == MPOL_DEFAULT) { 336 if (nodes && !nodes_empty(*nodes)) 337 return ERR_PTR(-EINVAL); 338 return NULL; 339 } 340 VM_BUG_ON(!nodes); 341 342 /* 343 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 344 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 345 * All other modes require a valid pointer to a non-empty nodemask. 346 */ 347 if (mode == MPOL_PREFERRED) { 348 if (nodes_empty(*nodes)) { 349 if (((flags & MPOL_F_STATIC_NODES) || 350 (flags & MPOL_F_RELATIVE_NODES))) 351 return ERR_PTR(-EINVAL); 352 353 mode = MPOL_LOCAL; 354 } 355 } else if (mode == MPOL_LOCAL) { 356 if (!nodes_empty(*nodes) || 357 (flags & MPOL_F_STATIC_NODES) || 358 (flags & MPOL_F_RELATIVE_NODES)) 359 return ERR_PTR(-EINVAL); 360 } else if (nodes_empty(*nodes)) 361 return ERR_PTR(-EINVAL); 362 363 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 364 if (!policy) 365 return ERR_PTR(-ENOMEM); 366 atomic_set(&policy->refcnt, 1); 367 policy->mode = mode; 368 policy->flags = flags; 369 policy->home_node = NUMA_NO_NODE; 370 371 return policy; 372 } 373 374 /* Slow path of a mpol destructor. */ 375 void __mpol_put(struct mempolicy *pol) 376 { 377 if (!atomic_dec_and_test(&pol->refcnt)) 378 return; 379 kmem_cache_free(policy_cache, pol); 380 } 381 382 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 383 { 384 } 385 386 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 387 { 388 nodemask_t tmp; 389 390 if (pol->flags & MPOL_F_STATIC_NODES) 391 nodes_and(tmp, pol->w.user_nodemask, *nodes); 392 else if (pol->flags & MPOL_F_RELATIVE_NODES) 393 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 394 else { 395 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 396 *nodes); 397 pol->w.cpuset_mems_allowed = *nodes; 398 } 399 400 if (nodes_empty(tmp)) 401 tmp = *nodes; 402 403 pol->nodes = tmp; 404 } 405 406 static void mpol_rebind_preferred(struct mempolicy *pol, 407 const nodemask_t *nodes) 408 { 409 pol->w.cpuset_mems_allowed = *nodes; 410 } 411 412 /* 413 * mpol_rebind_policy - Migrate a policy to a different set of nodes 414 * 415 * Per-vma policies are protected by mmap_lock. 
Allocations using per-task 416 * policies are protected by task->mems_allowed_seq to prevent a premature 417 * OOM/allocation failure due to parallel nodemask modification. 418 */ 419 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 420 { 421 if (!pol || pol->mode == MPOL_LOCAL) 422 return; 423 if (!mpol_store_user_nodemask(pol) && 424 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 425 return; 426 427 mpol_ops[pol->mode].rebind(pol, newmask); 428 } 429 430 /* 431 * Wrapper for mpol_rebind_policy() that just requires task 432 * pointer, and updates task mempolicy. 433 * 434 * Called with task's alloc_lock held. 435 */ 436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 437 { 438 mpol_rebind_policy(tsk->mempolicy, new); 439 } 440 441 /* 442 * Rebind each vma in mm to new nodemask. 443 * 444 * Call holding a reference to mm. Takes mm->mmap_lock during call. 445 */ 446 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 447 { 448 struct vm_area_struct *vma; 449 VMA_ITERATOR(vmi, mm, 0); 450 451 mmap_write_lock(mm); 452 for_each_vma(vmi, vma) { 453 vma_start_write(vma); 454 mpol_rebind_policy(vma->vm_policy, new); 455 } 456 mmap_write_unlock(mm); 457 } 458 459 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 460 [MPOL_DEFAULT] = { 461 .rebind = mpol_rebind_default, 462 }, 463 [MPOL_INTERLEAVE] = { 464 .create = mpol_new_nodemask, 465 .rebind = mpol_rebind_nodemask, 466 }, 467 [MPOL_PREFERRED] = { 468 .create = mpol_new_preferred, 469 .rebind = mpol_rebind_preferred, 470 }, 471 [MPOL_BIND] = { 472 .create = mpol_new_nodemask, 473 .rebind = mpol_rebind_nodemask, 474 }, 475 [MPOL_LOCAL] = { 476 .rebind = mpol_rebind_default, 477 }, 478 [MPOL_PREFERRED_MANY] = { 479 .create = mpol_new_nodemask, 480 .rebind = mpol_rebind_preferred, 481 }, 482 [MPOL_WEIGHTED_INTERLEAVE] = { 483 .create = mpol_new_nodemask, 484 .rebind = mpol_rebind_nodemask, 485 }, 486 }; 487 488 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 489 unsigned long flags); 490 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 491 pgoff_t ilx, int *nid); 492 493 static bool strictly_unmovable(unsigned long flags) 494 { 495 /* 496 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 497 * if any misplaced page is found. 498 */ 499 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 500 MPOL_MF_STRICT; 501 } 502 503 struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 504 struct mempolicy *pol; 505 pgoff_t ilx; 506 }; 507 508 struct queue_pages { 509 struct list_head *pagelist; 510 unsigned long flags; 511 nodemask_t *nmask; 512 unsigned long start; 513 unsigned long end; 514 struct vm_area_struct *first; 515 struct folio *large; /* note last large folio encountered */ 516 long nr_failed; /* could not be isolated at this time */ 517 }; 518 519 /* 520 * Check if the folio's nid is in qp->nmask. 521 * 522 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 523 * in the invert of qp->nmask. 
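 *
 * For example (illustrative only): do_mbind() queues with MPOL_MF_INVERT
 * set, so with nmask = {0,1} a folio resident on node 2 makes this return
 * true (misplaced, queue it for migration), while a folio already on
 * node 0 returns false and is left alone.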
524 */ 525 static inline bool queue_folio_required(struct folio *folio, 526 struct queue_pages *qp) 527 { 528 int nid = folio_nid(folio); 529 unsigned long flags = qp->flags; 530 531 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 532 } 533 534 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 535 { 536 struct folio *folio; 537 struct queue_pages *qp = walk->private; 538 539 if (unlikely(is_pmd_migration_entry(*pmd))) { 540 qp->nr_failed++; 541 return; 542 } 543 folio = pmd_folio(*pmd); 544 if (is_huge_zero_folio(folio)) { 545 walk->action = ACTION_CONTINUE; 546 return; 547 } 548 if (!queue_folio_required(folio, qp)) 549 return; 550 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 551 !vma_migratable(walk->vma) || 552 !migrate_folio_add(folio, qp->pagelist, qp->flags)) 553 qp->nr_failed++; 554 } 555 556 /* 557 * Scan through folios, checking if they satisfy the required conditions, 558 * moving them from LRU to local pagelist for migration if they do (or not). 559 * 560 * queue_folios_pte_range() has two possible return values: 561 * 0 - continue walking to scan for more, even if an existing folio on the 562 * wrong node could not be isolated and queued for migration. 563 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 564 * and an existing folio was on a node that does not follow the policy. 565 */ 566 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 567 unsigned long end, struct mm_walk *walk) 568 { 569 const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; 570 struct vm_area_struct *vma = walk->vma; 571 struct folio *folio; 572 struct queue_pages *qp = walk->private; 573 unsigned long flags = qp->flags; 574 pte_t *pte, *mapped_pte; 575 pte_t ptent; 576 spinlock_t *ptl; 577 int max_nr, nr; 578 579 ptl = pmd_trans_huge_lock(pmd, vma); 580 if (ptl) { 581 queue_folios_pmd(pmd, walk); 582 spin_unlock(ptl); 583 goto out; 584 } 585 586 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 587 if (!pte) { 588 walk->action = ACTION_AGAIN; 589 return 0; 590 } 591 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { 592 max_nr = (end - addr) >> PAGE_SHIFT; 593 nr = 1; 594 ptent = ptep_get(pte); 595 if (pte_none(ptent)) 596 continue; 597 if (!pte_present(ptent)) { 598 if (is_migration_entry(pte_to_swp_entry(ptent))) 599 qp->nr_failed++; 600 continue; 601 } 602 folio = vm_normal_folio(vma, addr, ptent); 603 if (!folio || folio_is_zone_device(folio)) 604 continue; 605 if (folio_test_large(folio) && max_nr != 1) 606 nr = folio_pte_batch(folio, addr, pte, ptent, 607 max_nr, fpb_flags, 608 NULL, NULL, NULL); 609 /* 610 * vm_normal_folio() filters out zero pages, but there might 611 * still be reserved folios to skip, perhaps in a VDSO. 612 */ 613 if (folio_test_reserved(folio)) 614 continue; 615 if (!queue_folio_required(folio, qp)) 616 continue; 617 if (folio_test_large(folio)) { 618 /* 619 * A large folio can only be isolated from LRU once, 620 * but may be mapped by many PTEs (and Copy-On-Write may 621 * intersperse PTEs of other, order 0, folios). This is 622 * a common case, so don't mistake it for failure (but 623 * there can be other cases of multi-mapped pages which 624 * this quick check does not help to filter out - and a 625 * search of the pagelist might grow to be prohibitive). 626 * 627 * migrate_pages(&pagelist) returns nr_failed folios, so 628 * check "large" now so that queue_pages_range() returns 629 * a comparable nr_failed folios. 
This does imply that 630 * if folio could not be isolated for some racy reason 631 * at its first PTE, later PTEs will not give it another 632 * chance of isolation; but keeps the accounting simple. 633 */ 634 if (folio == qp->large) 635 continue; 636 qp->large = folio; 637 } 638 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 639 !vma_migratable(vma) || 640 !migrate_folio_add(folio, qp->pagelist, flags)) { 641 qp->nr_failed += nr; 642 if (strictly_unmovable(flags)) 643 break; 644 } 645 } 646 pte_unmap_unlock(mapped_pte, ptl); 647 cond_resched(); 648 out: 649 if (qp->nr_failed && strictly_unmovable(flags)) 650 return -EIO; 651 return 0; 652 } 653 654 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, 655 unsigned long addr, unsigned long end, 656 struct mm_walk *walk) 657 { 658 #ifdef CONFIG_HUGETLB_PAGE 659 struct queue_pages *qp = walk->private; 660 unsigned long flags = qp->flags; 661 struct folio *folio; 662 spinlock_t *ptl; 663 pte_t entry; 664 665 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 666 entry = huge_ptep_get(walk->mm, addr, pte); 667 if (!pte_present(entry)) { 668 if (unlikely(is_hugetlb_entry_migration(entry))) 669 qp->nr_failed++; 670 goto unlock; 671 } 672 folio = pfn_folio(pte_pfn(entry)); 673 if (!queue_folio_required(folio, qp)) 674 goto unlock; 675 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 676 !vma_migratable(walk->vma)) { 677 qp->nr_failed++; 678 goto unlock; 679 } 680 /* 681 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 682 * Choosing not to migrate a shared folio is not counted as a failure. 683 * 684 * See folio_maybe_mapped_shared() on possible imprecision when we 685 * cannot easily detect if a folio is shared. 686 */ 687 if ((flags & MPOL_MF_MOVE_ALL) || 688 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) 689 if (!folio_isolate_hugetlb(folio, qp->pagelist)) 690 qp->nr_failed++; 691 unlock: 692 spin_unlock(ptl); 693 if (qp->nr_failed && strictly_unmovable(flags)) 694 return -EIO; 695 #endif 696 return 0; 697 } 698 699 #ifdef CONFIG_NUMA_BALANCING 700 /* 701 * This is used to mark a range of virtual addresses to be inaccessible. 702 * These are later cleared by a NUMA hinting fault. Depending on these 703 * faults, pages may be migrated for better NUMA placement. 704 * 705 * This is assuming that NUMA faults are handled using PROT_NONE. If 706 * an architecture makes a different choice, it will need further 707 * changes to the core. 
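 *
 * Userspace opts a policy into these hinting faults with the
 * MPOL_F_NUMA_BALANCING mode flag (see sanitize_mpol_flags() below),
 * roughly as in this sketch (assuming the <numaif.h> definitions):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
 *		      &nodes, 8 * sizeof(nodes) + 1);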
708 */ 709 unsigned long change_prot_numa(struct vm_area_struct *vma, 710 unsigned long addr, unsigned long end) 711 { 712 struct mmu_gather tlb; 713 long nr_updated; 714 715 tlb_gather_mmu(&tlb, vma->vm_mm); 716 717 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); 718 if (nr_updated > 0) { 719 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 720 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); 721 } 722 723 tlb_finish_mmu(&tlb); 724 725 return nr_updated; 726 } 727 #endif /* CONFIG_NUMA_BALANCING */ 728 729 static int queue_pages_test_walk(unsigned long start, unsigned long end, 730 struct mm_walk *walk) 731 { 732 struct vm_area_struct *next, *vma = walk->vma; 733 struct queue_pages *qp = walk->private; 734 unsigned long flags = qp->flags; 735 736 /* range check first */ 737 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); 738 739 if (!qp->first) { 740 qp->first = vma; 741 if (!(flags & MPOL_MF_DISCONTIG_OK) && 742 (qp->start < vma->vm_start)) 743 /* hole at head side of range */ 744 return -EFAULT; 745 } 746 next = find_vma(vma->vm_mm, vma->vm_end); 747 if (!(flags & MPOL_MF_DISCONTIG_OK) && 748 ((vma->vm_end < qp->end) && 749 (!next || vma->vm_end < next->vm_start))) 750 /* hole at middle or tail of range */ 751 return -EFAULT; 752 753 /* 754 * Need check MPOL_MF_STRICT to return -EIO if possible 755 * regardless of vma_migratable 756 */ 757 if (!vma_migratable(vma) && 758 !(flags & MPOL_MF_STRICT)) 759 return 1; 760 761 /* 762 * Check page nodes, and queue pages to move, in the current vma. 763 * But if no moving, and no strict checking, the scan can be skipped. 764 */ 765 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 766 return 0; 767 return 1; 768 } 769 770 static const struct mm_walk_ops queue_pages_walk_ops = { 771 .hugetlb_entry = queue_folios_hugetlb, 772 .pmd_entry = queue_folios_pte_range, 773 .test_walk = queue_pages_test_walk, 774 .walk_lock = PGWALK_RDLOCK, 775 }; 776 777 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 778 .hugetlb_entry = queue_folios_hugetlb, 779 .pmd_entry = queue_folios_pte_range, 780 .test_walk = queue_pages_test_walk, 781 .walk_lock = PGWALK_WRLOCK, 782 }; 783 784 /* 785 * Walk through page tables and collect pages to be migrated. 786 * 787 * If pages found in a given range are not on the required set of @nodes, 788 * and migration is allowed, they are isolated and queued to @pagelist. 789 * 790 * queue_pages_range() may return: 791 * 0 - all pages already on the right node, or successfully queued for moving 792 * (or neither strict checking nor moving requested: only range checking). 793 * >0 - this number of misplaced folios could not be queued for moving 794 * (a hugetlbfs page or a transparent huge page being counted as 1). 795 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. 796 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. 797 */ 798 static long 799 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 800 nodemask_t *nodes, unsigned long flags, 801 struct list_head *pagelist) 802 { 803 int err; 804 struct queue_pages qp = { 805 .pagelist = pagelist, 806 .flags = flags, 807 .nmask = nodes, 808 .start = start, 809 .end = end, 810 .first = NULL, 811 }; 812 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? 
813 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 814 815 err = walk_page_range(mm, start, end, ops, &qp); 816 817 if (!qp.first) 818 /* whole range in hole */ 819 err = -EFAULT; 820 821 return err ? : qp.nr_failed; 822 } 823 824 /* 825 * Apply policy to a single VMA 826 * This must be called with the mmap_lock held for writing. 827 */ 828 static int vma_replace_policy(struct vm_area_struct *vma, 829 struct mempolicy *pol) 830 { 831 int err; 832 struct mempolicy *old; 833 struct mempolicy *new; 834 835 vma_assert_write_locked(vma); 836 837 new = mpol_dup(pol); 838 if (IS_ERR(new)) 839 return PTR_ERR(new); 840 841 if (vma->vm_ops && vma->vm_ops->set_policy) { 842 err = vma->vm_ops->set_policy(vma, new); 843 if (err) 844 goto err_out; 845 } 846 847 old = vma->vm_policy; 848 vma->vm_policy = new; /* protected by mmap_lock */ 849 mpol_put(old); 850 851 return 0; 852 err_out: 853 mpol_put(new); 854 return err; 855 } 856 857 /* Split or merge the VMA (if required) and apply the new policy */ 858 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, 859 struct vm_area_struct **prev, unsigned long start, 860 unsigned long end, struct mempolicy *new_pol) 861 { 862 unsigned long vmstart, vmend; 863 864 vmend = min(end, vma->vm_end); 865 if (start > vma->vm_start) { 866 *prev = vma; 867 vmstart = start; 868 } else { 869 vmstart = vma->vm_start; 870 } 871 872 if (mpol_equal(vma->vm_policy, new_pol)) { 873 *prev = vma; 874 return 0; 875 } 876 877 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); 878 if (IS_ERR(vma)) 879 return PTR_ERR(vma); 880 881 *prev = vma; 882 return vma_replace_policy(vma, new_pol); 883 } 884 885 /* Set the process memory policy */ 886 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 887 nodemask_t *nodes) 888 { 889 struct mempolicy *new, *old; 890 NODEMASK_SCRATCH(scratch); 891 int ret; 892 893 if (!scratch) 894 return -ENOMEM; 895 896 new = mpol_new(mode, flags, nodes); 897 if (IS_ERR(new)) { 898 ret = PTR_ERR(new); 899 goto out; 900 } 901 902 task_lock(current); 903 ret = mpol_set_nodemask(new, nodes, scratch); 904 if (ret) { 905 task_unlock(current); 906 mpol_put(new); 907 goto out; 908 } 909 910 old = current->mempolicy; 911 current->mempolicy = new; 912 if (new && (new->mode == MPOL_INTERLEAVE || 913 new->mode == MPOL_WEIGHTED_INTERLEAVE)) { 914 current->il_prev = MAX_NUMNODES-1; 915 current->il_weight = 0; 916 } 917 task_unlock(current); 918 mpol_put(old); 919 ret = 0; 920 out: 921 NODEMASK_SCRATCH_FREE(scratch); 922 return ret; 923 } 924 925 /* 926 * Return nodemask for policy for get_mempolicy() query 927 * 928 * Called with task's alloc_lock held 929 */ 930 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) 931 { 932 nodes_clear(*nodes); 933 if (pol == &default_policy) 934 return; 935 936 switch (pol->mode) { 937 case MPOL_BIND: 938 case MPOL_INTERLEAVE: 939 case MPOL_PREFERRED: 940 case MPOL_PREFERRED_MANY: 941 case MPOL_WEIGHTED_INTERLEAVE: 942 *nodes = pol->nodes; 943 break; 944 case MPOL_LOCAL: 945 /* return empty node mask for local allocation */ 946 break; 947 default: 948 BUG(); 949 } 950 } 951 952 static int lookup_node(struct mm_struct *mm, unsigned long addr) 953 { 954 struct page *p = NULL; 955 int ret; 956 957 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); 958 if (ret > 0) { 959 ret = page_to_nid(p); 960 put_page(p); 961 } 962 return ret; 963 } 964 965 /* Retrieve NUMA policy */ 966 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 967 unsigned 
long addr, unsigned long flags) 968 { 969 int err; 970 struct mm_struct *mm = current->mm; 971 struct vm_area_struct *vma = NULL; 972 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; 973 974 if (flags & 975 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 976 return -EINVAL; 977 978 if (flags & MPOL_F_MEMS_ALLOWED) { 979 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 980 return -EINVAL; 981 *policy = 0; /* just so it's initialized */ 982 task_lock(current); 983 *nmask = cpuset_current_mems_allowed; 984 task_unlock(current); 985 return 0; 986 } 987 988 if (flags & MPOL_F_ADDR) { 989 pgoff_t ilx; /* ignored here */ 990 /* 991 * Do NOT fall back to task policy if the 992 * vma/shared policy at addr is NULL. We 993 * want to return MPOL_DEFAULT in this case. 994 */ 995 mmap_read_lock(mm); 996 vma = vma_lookup(mm, addr); 997 if (!vma) { 998 mmap_read_unlock(mm); 999 return -EFAULT; 1000 } 1001 pol = __get_vma_policy(vma, addr, &ilx); 1002 } else if (addr) 1003 return -EINVAL; 1004 1005 if (!pol) 1006 pol = &default_policy; /* indicates default behavior */ 1007 1008 if (flags & MPOL_F_NODE) { 1009 if (flags & MPOL_F_ADDR) { 1010 /* 1011 * Take a refcount on the mpol, because we are about to 1012 * drop the mmap_lock, after which only "pol" remains 1013 * valid, "vma" is stale. 1014 */ 1015 pol_refcount = pol; 1016 vma = NULL; 1017 mpol_get(pol); 1018 mmap_read_unlock(mm); 1019 err = lookup_node(mm, addr); 1020 if (err < 0) 1021 goto out; 1022 *policy = err; 1023 } else if (pol == current->mempolicy && 1024 pol->mode == MPOL_INTERLEAVE) { 1025 *policy = next_node_in(current->il_prev, pol->nodes); 1026 } else if (pol == current->mempolicy && 1027 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1028 if (current->il_weight) 1029 *policy = current->il_prev; 1030 else 1031 *policy = next_node_in(current->il_prev, 1032 pol->nodes); 1033 } else { 1034 err = -EINVAL; 1035 goto out; 1036 } 1037 } else { 1038 *policy = pol == &default_policy ? MPOL_DEFAULT : 1039 pol->mode; 1040 /* 1041 * Internal mempolicy flags must be masked off before exposing 1042 * the policy to userspace. 1043 */ 1044 *policy |= (pol->flags & MPOL_MODE_FLAGS); 1045 } 1046 1047 err = 0; 1048 if (nmask) { 1049 if (mpol_store_user_nodemask(pol)) { 1050 *nmask = pol->w.user_nodemask; 1051 } else { 1052 task_lock(current); 1053 get_policy_nodemask(pol, nmask); 1054 task_unlock(current); 1055 } 1056 } 1057 1058 out: 1059 mpol_cond_put(pol); 1060 if (vma) 1061 mmap_read_unlock(mm); 1062 if (pol_refcount) 1063 mpol_put(pol_refcount); 1064 return err; 1065 } 1066 1067 #ifdef CONFIG_MIGRATION 1068 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1069 unsigned long flags) 1070 { 1071 /* 1072 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 1073 * Choosing not to migrate a shared folio is not counted as a failure. 1074 * 1075 * See folio_maybe_mapped_shared() on possible imprecision when we 1076 * cannot easily detect if a folio is shared. 1077 */ 1078 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { 1079 if (folio_isolate_lru(folio)) { 1080 list_add_tail(&folio->lru, foliolist); 1081 node_stat_mod_folio(folio, 1082 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1083 folio_nr_pages(folio)); 1084 } else { 1085 /* 1086 * Non-movable folio may reach here. And, there may be 1087 * temporary off LRU folios or non-LRU movable folios. 1088 * Treat them as unmovable folios since they can't be 1089 * isolated, so they can't be moved at the moment. 
1090 */ 1091 return false; 1092 } 1093 } 1094 return true; 1095 } 1096 1097 /* 1098 * Migrate pages from one node to a target node. 1099 * Returns error or the number of pages not migrated. 1100 */ 1101 static long migrate_to_node(struct mm_struct *mm, int source, int dest, 1102 int flags) 1103 { 1104 nodemask_t nmask; 1105 struct vm_area_struct *vma; 1106 LIST_HEAD(pagelist); 1107 long nr_failed; 1108 long err = 0; 1109 struct migration_target_control mtc = { 1110 .nid = dest, 1111 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 1112 .reason = MR_SYSCALL, 1113 }; 1114 1115 nodes_clear(nmask); 1116 node_set(source, nmask); 1117 1118 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1119 1120 mmap_read_lock(mm); 1121 vma = find_vma(mm, 0); 1122 if (unlikely(!vma)) { 1123 mmap_read_unlock(mm); 1124 return 0; 1125 } 1126 1127 /* 1128 * This does not migrate the range, but isolates all pages that 1129 * need migration. Between passing in the full user address 1130 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, 1131 * but passes back the count of pages which could not be isolated. 1132 */ 1133 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, 1134 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1135 mmap_read_unlock(mm); 1136 1137 if (!list_empty(&pagelist)) { 1138 err = migrate_pages(&pagelist, alloc_migration_target, NULL, 1139 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); 1140 if (err) 1141 putback_movable_pages(&pagelist); 1142 } 1143 1144 if (err >= 0) 1145 err += nr_failed; 1146 return err; 1147 } 1148 1149 /* 1150 * Move pages between the two nodesets so as to preserve the physical 1151 * layout as much as possible. 1152 * 1153 * Returns the number of page that could not be moved. 1154 */ 1155 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1156 const nodemask_t *to, int flags) 1157 { 1158 long nr_failed = 0; 1159 long err = 0; 1160 nodemask_t tmp; 1161 1162 lru_cache_disable(); 1163 1164 /* 1165 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1166 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1167 * bit in 'tmp', and return that <source, dest> pair for migration. 1168 * The pair of nodemasks 'to' and 'from' define the map. 1169 * 1170 * If no pair of bits is found that way, fallback to picking some 1171 * pair of 'source' and 'dest' bits that are not the same. If the 1172 * 'source' and 'dest' bits are the same, this represents a node 1173 * that will be migrating to itself, so no pages need move. 1174 * 1175 * If no bits are left in 'tmp', or if all remaining bits left 1176 * in 'tmp' correspond to the same bit in 'to', return false 1177 * (nothing left to migrate). 1178 * 1179 * This lets us pick a pair of nodes to migrate between, such that 1180 * if possible the dest node is not already occupied by some other 1181 * source node, minimizing the risk of overloading the memory on a 1182 * node that would happen if we migrated incoming memory to a node 1183 * before migrating outgoing memory source that same node. 1184 * 1185 * A single scan of tmp is sufficient. As we go, we remember the 1186 * most recent <s, d> pair that moved (s != d). If we find a pair 1187 * that not only moved, but what's better, moved to an empty slot 1188 * (d is not set in tmp), then we break out then, with that pair. 1189 * Otherwise when we finish scanning from_tmp, we at least have the 1190 * most recent <s, d> pair that moved. 
If we get all the way through 1191 * the scan of tmp without finding any node that moved, much less 1192 * moved to an empty node, then there is nothing left worth migrating. 1193 */ 1194 1195 tmp = *from; 1196 while (!nodes_empty(tmp)) { 1197 int s, d; 1198 int source = NUMA_NO_NODE; 1199 int dest = 0; 1200 1201 for_each_node_mask(s, tmp) { 1202 1203 /* 1204 * do_migrate_pages() tries to maintain the relative 1205 * node relationship of the pages established between 1206 * threads and memory areas. 1207 * 1208 * However if the number of source nodes is not equal to 1209 * the number of destination nodes we can not preserve 1210 * this node relative relationship. In that case, skip 1211 * copying memory from a node that is in the destination 1212 * mask. 1213 * 1214 * Example: [2,3,4] -> [3,4,5] moves everything. 1215 * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 1216 */ 1217 1218 if ((nodes_weight(*from) != nodes_weight(*to)) && 1219 (node_isset(s, *to))) 1220 continue; 1221 1222 d = node_remap(s, *from, *to); 1223 if (s == d) 1224 continue; 1225 1226 source = s; /* Node moved. Memorize */ 1227 dest = d; 1228 1229 /* dest not in remaining from nodes? */ 1230 if (!node_isset(dest, tmp)) 1231 break; 1232 } 1233 if (source == NUMA_NO_NODE) 1234 break; 1235 1236 node_clear(source, tmp); 1237 err = migrate_to_node(mm, source, dest, flags); 1238 if (err > 0) 1239 nr_failed += err; 1240 if (err < 0) 1241 break; 1242 } 1243 1244 lru_cache_enable(); 1245 if (err < 0) 1246 return err; 1247 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; 1248 } 1249 1250 /* 1251 * Allocate a new folio for page migration, according to NUMA mempolicy. 1252 */ 1253 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1254 unsigned long private) 1255 { 1256 struct migration_mpol *mmpol = (struct migration_mpol *)private; 1257 struct mempolicy *pol = mmpol->pol; 1258 pgoff_t ilx = mmpol->ilx; 1259 unsigned int order; 1260 int nid = numa_node_id(); 1261 gfp_t gfp; 1262 1263 order = folio_order(src); 1264 ilx += src->index >> order; 1265 1266 if (folio_test_hugetlb(src)) { 1267 nodemask_t *nodemask; 1268 struct hstate *h; 1269 1270 h = folio_hstate(src); 1271 gfp = htlb_alloc_mask(h); 1272 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 1273 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, 1274 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); 1275 } 1276 1277 if (folio_test_large(src)) 1278 gfp = GFP_TRANSHUGE; 1279 else 1280 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; 1281 1282 return folio_alloc_mpol(gfp, order, pol, ilx, nid); 1283 } 1284 #else 1285 1286 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1287 unsigned long flags) 1288 { 1289 return false; 1290 } 1291 1292 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1293 const nodemask_t *to, int flags) 1294 { 1295 return -ENOSYS; 1296 } 1297 1298 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1299 unsigned long private) 1300 { 1301 return NULL; 1302 } 1303 #endif 1304 1305 static long do_mbind(unsigned long start, unsigned long len, 1306 unsigned short mode, unsigned short mode_flags, 1307 nodemask_t *nmask, unsigned long flags) 1308 { 1309 struct mm_struct *mm = current->mm; 1310 struct vm_area_struct *vma, *prev; 1311 struct vma_iterator vmi; 1312 struct migration_mpol mmpol; 1313 struct mempolicy *new; 1314 unsigned long end; 1315 long err; 1316 long nr_failed; 1317 LIST_HEAD(pagelist); 1318 1319 if (flags & ~(unsigned long)MPOL_MF_VALID) 1320 
return -EINVAL; 1321 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1322 return -EPERM; 1323 1324 if (start & ~PAGE_MASK) 1325 return -EINVAL; 1326 1327 if (mode == MPOL_DEFAULT) 1328 flags &= ~MPOL_MF_STRICT; 1329 1330 len = PAGE_ALIGN(len); 1331 end = start + len; 1332 1333 if (end < start) 1334 return -EINVAL; 1335 if (end == start) 1336 return 0; 1337 1338 new = mpol_new(mode, mode_flags, nmask); 1339 if (IS_ERR(new)) 1340 return PTR_ERR(new); 1341 1342 /* 1343 * If we are using the default policy then operation 1344 * on discontinuous address spaces is okay after all 1345 */ 1346 if (!new) 1347 flags |= MPOL_MF_DISCONTIG_OK; 1348 1349 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1350 lru_cache_disable(); 1351 { 1352 NODEMASK_SCRATCH(scratch); 1353 if (scratch) { 1354 mmap_write_lock(mm); 1355 err = mpol_set_nodemask(new, nmask, scratch); 1356 if (err) 1357 mmap_write_unlock(mm); 1358 } else 1359 err = -ENOMEM; 1360 NODEMASK_SCRATCH_FREE(scratch); 1361 } 1362 if (err) 1363 goto mpol_out; 1364 1365 /* 1366 * Lock the VMAs before scanning for pages to migrate, 1367 * to ensure we don't miss a concurrently inserted page. 1368 */ 1369 nr_failed = queue_pages_range(mm, start, end, nmask, 1370 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); 1371 1372 if (nr_failed < 0) { 1373 err = nr_failed; 1374 nr_failed = 0; 1375 } else { 1376 vma_iter_init(&vmi, mm, start); 1377 prev = vma_prev(&vmi); 1378 for_each_vma_range(vmi, vma, end) { 1379 err = mbind_range(&vmi, vma, &prev, start, end, new); 1380 if (err) 1381 break; 1382 } 1383 } 1384 1385 if (!err && !list_empty(&pagelist)) { 1386 /* Convert MPOL_DEFAULT's NULL to task or default policy */ 1387 if (!new) { 1388 new = get_task_policy(current); 1389 mpol_get(new); 1390 } 1391 mmpol.pol = new; 1392 mmpol.ilx = 0; 1393 1394 /* 1395 * In the interleaved case, attempt to allocate on exactly the 1396 * targeted nodes, for the first VMA to be migrated; for later 1397 * VMAs, the nodes will still be interleaved from the targeted 1398 * nodemask, but one by one may be selected differently. 1399 */ 1400 if (new->mode == MPOL_INTERLEAVE || 1401 new->mode == MPOL_WEIGHTED_INTERLEAVE) { 1402 struct folio *folio; 1403 unsigned int order; 1404 unsigned long addr = -EFAULT; 1405 1406 list_for_each_entry(folio, &pagelist, lru) { 1407 if (!folio_test_ksm(folio)) 1408 break; 1409 } 1410 if (!list_entry_is_head(folio, &pagelist, lru)) { 1411 vma_iter_init(&vmi, mm, start); 1412 for_each_vma_range(vmi, vma, end) { 1413 addr = page_address_in_vma(folio, 1414 folio_page(folio, 0), vma); 1415 if (addr != -EFAULT) 1416 break; 1417 } 1418 } 1419 if (addr != -EFAULT) { 1420 order = folio_order(folio); 1421 /* We already know the pol, but not the ilx */ 1422 mpol_cond_put(get_vma_policy(vma, addr, order, 1423 &mmpol.ilx)); 1424 /* Set base from which to increment by index */ 1425 mmpol.ilx -= folio->index >> order; 1426 } 1427 } 1428 } 1429 1430 mmap_write_unlock(mm); 1431 1432 if (!err && !list_empty(&pagelist)) { 1433 nr_failed |= migrate_pages(&pagelist, 1434 alloc_migration_target_by_mpol, NULL, 1435 (unsigned long)&mmpol, MIGRATE_SYNC, 1436 MR_MEMPOLICY_MBIND, NULL); 1437 } 1438 1439 if (nr_failed && (flags & MPOL_MF_STRICT)) 1440 err = -EIO; 1441 if (!list_empty(&pagelist)) 1442 putback_movable_pages(&pagelist); 1443 mpol_out: 1444 mpol_put(new); 1445 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1446 lru_cache_enable(); 1447 return err; 1448 } 1449 1450 /* 1451 * User space interface with variable sized bitmaps for nodelists. 
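 *
 * For example (a sketch): a caller that wants nodes 1 and 3 passes an
 * unsigned long bitmap with bits 1 and 3 set, plus a maxnode that covers
 * them:
 *
 *	unsigned long nodes[1] = { (1UL << 1) | (1UL << 3) };
 *	mbind(addr, len, MPOL_INTERLEAVE, nodes, 8 * sizeof(nodes) + 1, 0);
 *
 * get_nodes()/get_bitmap() below copy and trim such a bitmap, reject
 * non-zero bits above MAX_NUMNODES, and handle compat tasks via
 * compat_get_bitmap().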
1452 */ 1453 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, 1454 unsigned long maxnode) 1455 { 1456 unsigned long nlongs = BITS_TO_LONGS(maxnode); 1457 int ret; 1458 1459 if (in_compat_syscall()) 1460 ret = compat_get_bitmap(mask, 1461 (const compat_ulong_t __user *)nmask, 1462 maxnode); 1463 else 1464 ret = copy_from_user(mask, nmask, 1465 nlongs * sizeof(unsigned long)); 1466 1467 if (ret) 1468 return -EFAULT; 1469 1470 if (maxnode % BITS_PER_LONG) 1471 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; 1472 1473 return 0; 1474 } 1475 1476 /* Copy a node mask from user space. */ 1477 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1478 unsigned long maxnode) 1479 { 1480 --maxnode; 1481 nodes_clear(*nodes); 1482 if (maxnode == 0 || !nmask) 1483 return 0; 1484 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1485 return -EINVAL; 1486 1487 /* 1488 * When the user specified more nodes than supported just check 1489 * if the non supported part is all zero, one word at a time, 1490 * starting at the end. 1491 */ 1492 while (maxnode > MAX_NUMNODES) { 1493 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); 1494 unsigned long t; 1495 1496 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) 1497 return -EFAULT; 1498 1499 if (maxnode - bits >= MAX_NUMNODES) { 1500 maxnode -= bits; 1501 } else { 1502 maxnode = MAX_NUMNODES; 1503 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); 1504 } 1505 if (t) 1506 return -EINVAL; 1507 } 1508 1509 return get_bitmap(nodes_addr(*nodes), nmask, maxnode); 1510 } 1511 1512 /* Copy a kernel node mask to user space */ 1513 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1514 nodemask_t *nodes) 1515 { 1516 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1517 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); 1518 bool compat = in_compat_syscall(); 1519 1520 if (compat) 1521 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); 1522 1523 if (copy > nbytes) { 1524 if (copy > PAGE_SIZE) 1525 return -EINVAL; 1526 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1527 return -EFAULT; 1528 copy = nbytes; 1529 maxnode = nr_node_ids; 1530 } 1531 1532 if (compat) 1533 return compat_put_bitmap((compat_ulong_t __user *)mask, 1534 nodes_addr(*nodes), maxnode); 1535 1536 return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; 1537 } 1538 1539 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ 1540 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) 1541 { 1542 *flags = *mode & MPOL_MODE_FLAGS; 1543 *mode &= ~MPOL_MODE_FLAGS; 1544 1545 if ((unsigned int)(*mode) >= MPOL_MAX) 1546 return -EINVAL; 1547 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) 1548 return -EINVAL; 1549 if (*flags & MPOL_F_NUMA_BALANCING) { 1550 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) 1551 *flags |= (MPOL_F_MOF | MPOL_F_MORON); 1552 else 1553 return -EINVAL; 1554 } 1555 return 0; 1556 } 1557 1558 static long kernel_mbind(unsigned long start, unsigned long len, 1559 unsigned long mode, const unsigned long __user *nmask, 1560 unsigned long maxnode, unsigned int flags) 1561 { 1562 unsigned short mode_flags; 1563 nodemask_t nodes; 1564 int lmode = mode; 1565 int err; 1566 1567 start = untagged_addr(start); 1568 err = sanitize_mpol_flags(&lmode, &mode_flags); 1569 if (err) 1570 return err; 1571 1572 err = get_nodes(&nodes, nmask, maxnode); 1573 if (err) 1574 return err; 1575 1576 return do_mbind(start, len, lmode, mode_flags, &nodes, flags); 1577 } 1578 1579 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, 1580 unsigned long, home_node, unsigned long, flags) 1581 { 1582 struct mm_struct *mm = current->mm; 1583 struct vm_area_struct *vma, *prev; 1584 struct mempolicy *new, *old; 1585 unsigned long end; 1586 int err = -ENOENT; 1587 VMA_ITERATOR(vmi, mm, start); 1588 1589 start = untagged_addr(start); 1590 if (start & ~PAGE_MASK) 1591 return -EINVAL; 1592 /* 1593 * flags is used for future extension if any. 1594 */ 1595 if (flags != 0) 1596 return -EINVAL; 1597 1598 /* 1599 * Check home_node is online to avoid accessing uninitialized 1600 * NODE_DATA. 1601 */ 1602 if (home_node >= MAX_NUMNODES || !node_online(home_node)) 1603 return -EINVAL; 1604 1605 len = PAGE_ALIGN(len); 1606 end = start + len; 1607 1608 if (end < start) 1609 return -EINVAL; 1610 if (end == start) 1611 return 0; 1612 mmap_write_lock(mm); 1613 prev = vma_prev(&vmi); 1614 for_each_vma_range(vmi, vma, end) { 1615 /* 1616 * If any vma in the range got policy other than MPOL_BIND 1617 * or MPOL_PREFERRED_MANY we return error. We don't reset 1618 * the home node for vmas we already updated before. 
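 *
 * (Usage sketch, not a reference: there is no glibc wrapper for this
 * syscall at the time of writing, so callers typically issue
 *	syscall(__NR_set_mempolicy_home_node, addr, len, node, 0);
 * after an mbind() that installed MPOL_BIND or MPOL_PREFERRED_MANY on
 * the range.)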
1619 */ 1620 old = vma_policy(vma); 1621 if (!old) { 1622 prev = vma; 1623 continue; 1624 } 1625 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { 1626 err = -EOPNOTSUPP; 1627 break; 1628 } 1629 new = mpol_dup(old); 1630 if (IS_ERR(new)) { 1631 err = PTR_ERR(new); 1632 break; 1633 } 1634 1635 vma_start_write(vma); 1636 new->home_node = home_node; 1637 err = mbind_range(&vmi, vma, &prev, start, end, new); 1638 mpol_put(new); 1639 if (err) 1640 break; 1641 } 1642 mmap_write_unlock(mm); 1643 return err; 1644 } 1645 1646 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1647 unsigned long, mode, const unsigned long __user *, nmask, 1648 unsigned long, maxnode, unsigned int, flags) 1649 { 1650 return kernel_mbind(start, len, mode, nmask, maxnode, flags); 1651 } 1652 1653 /* Set the process memory policy */ 1654 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, 1655 unsigned long maxnode) 1656 { 1657 unsigned short mode_flags; 1658 nodemask_t nodes; 1659 int lmode = mode; 1660 int err; 1661 1662 err = sanitize_mpol_flags(&lmode, &mode_flags); 1663 if (err) 1664 return err; 1665 1666 err = get_nodes(&nodes, nmask, maxnode); 1667 if (err) 1668 return err; 1669 1670 return do_set_mempolicy(lmode, mode_flags, &nodes); 1671 } 1672 1673 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, 1674 unsigned long, maxnode) 1675 { 1676 return kernel_set_mempolicy(mode, nmask, maxnode); 1677 } 1678 1679 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, 1680 const unsigned long __user *old_nodes, 1681 const unsigned long __user *new_nodes) 1682 { 1683 struct mm_struct *mm = NULL; 1684 struct task_struct *task; 1685 nodemask_t task_nodes; 1686 int err; 1687 nodemask_t *old; 1688 nodemask_t *new; 1689 NODEMASK_SCRATCH(scratch); 1690 1691 if (!scratch) 1692 return -ENOMEM; 1693 1694 old = &scratch->mask1; 1695 new = &scratch->mask2; 1696 1697 err = get_nodes(old, old_nodes, maxnode); 1698 if (err) 1699 goto out; 1700 1701 err = get_nodes(new, new_nodes, maxnode); 1702 if (err) 1703 goto out; 1704 1705 /* Find the mm_struct */ 1706 rcu_read_lock(); 1707 task = pid ? find_task_by_vpid(pid) : current; 1708 if (!task) { 1709 rcu_read_unlock(); 1710 err = -ESRCH; 1711 goto out; 1712 } 1713 get_task_struct(task); 1714 1715 err = -EINVAL; 1716 1717 /* 1718 * Check if this process has the right to modify the specified process. 1719 * Use the regular "ptrace_may_access()" checks. 1720 */ 1721 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1722 rcu_read_unlock(); 1723 err = -EPERM; 1724 goto out_put; 1725 } 1726 rcu_read_unlock(); 1727 1728 task_nodes = cpuset_mems_allowed(task); 1729 /* Is the user allowed to access the target nodes? */ 1730 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1731 err = -EPERM; 1732 goto out_put; 1733 } 1734 1735 task_nodes = cpuset_mems_allowed(current); 1736 nodes_and(*new, *new, task_nodes); 1737 if (nodes_empty(*new)) 1738 goto out_put; 1739 1740 err = security_task_movememory(task); 1741 if (err) 1742 goto out_put; 1743 1744 mm = get_task_mm(task); 1745 put_task_struct(task); 1746 1747 if (!mm) { 1748 err = -EINVAL; 1749 goto out; 1750 } 1751 1752 err = do_migrate_pages(mm, old, new, 1753 capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1754 1755 mmput(mm); 1756 out: 1757 NODEMASK_SCRATCH_FREE(scratch); 1758 1759 return err; 1760 1761 out_put: 1762 put_task_struct(task); 1763 goto out; 1764 } 1765 1766 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1767 const unsigned long __user *, old_nodes, 1768 const unsigned long __user *, new_nodes) 1769 { 1770 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); 1771 } 1772 1773 /* Retrieve NUMA policy */ 1774 static int kernel_get_mempolicy(int __user *policy, 1775 unsigned long __user *nmask, 1776 unsigned long maxnode, 1777 unsigned long addr, 1778 unsigned long flags) 1779 { 1780 int err; 1781 int pval; 1782 nodemask_t nodes; 1783 1784 if (nmask != NULL && maxnode < nr_node_ids) 1785 return -EINVAL; 1786 1787 addr = untagged_addr(addr); 1788 1789 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1790 1791 if (err) 1792 return err; 1793 1794 if (policy && put_user(pval, policy)) 1795 return -EFAULT; 1796 1797 if (nmask) 1798 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1799 1800 return err; 1801 } 1802 1803 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1804 unsigned long __user *, nmask, unsigned long, maxnode, 1805 unsigned long, addr, unsigned long, flags) 1806 { 1807 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); 1808 } 1809 1810 bool vma_migratable(struct vm_area_struct *vma) 1811 { 1812 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1813 return false; 1814 1815 /* 1816 * DAX device mappings require predictable access latency, so avoid 1817 * incurring periodic faults. 1818 */ 1819 if (vma_is_dax(vma)) 1820 return false; 1821 1822 if (is_vm_hugetlb_page(vma) && 1823 !hugepage_migration_supported(hstate_vma(vma))) 1824 return false; 1825 1826 /* 1827 * Migration allocates pages in the highest zone. If we cannot 1828 * do so then migration (at least from node to node) is not 1829 * possible. 1830 */ 1831 if (vma->vm_file && 1832 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) 1833 < policy_zone) 1834 return false; 1835 return true; 1836 } 1837 1838 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 1839 unsigned long addr, pgoff_t *ilx) 1840 { 1841 *ilx = 0; 1842 return (vma->vm_ops && vma->vm_ops->get_policy) ? 1843 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; 1844 } 1845 1846 /* 1847 * get_vma_policy(@vma, @addr, @order, @ilx) 1848 * @vma: virtual memory area whose policy is sought 1849 * @addr: address in @vma for shared policy lookup 1850 * @order: 0, or appropriate huge_page_order for interleaving 1851 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or 1852 * MPOL_WEIGHTED_INTERLEAVE 1853 * 1854 * Returns effective policy for a VMA at specified address. 1855 * Falls back to current->mempolicy or system default policy, as necessary. 1856 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1857 * count--added by the get_policy() vm_op, as appropriate--to protect against 1858 * freeing by another task. It is the caller's responsibility to free the 1859 * extra reference for shared policies. 
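 *
 * Typical usage in this file, shown as a sketch:
 *
 *	pol = get_vma_policy(vma, addr, order, &ilx);
 *	... allocate or migrate according to pol and ilx ...
 *	mpol_cond_put(pol);
 *
 * where mpol_cond_put() drops the extra reference only for MPOL_F_SHARED
 * policies.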
1860 */ 1861 struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1862 unsigned long addr, int order, pgoff_t *ilx) 1863 { 1864 struct mempolicy *pol; 1865 1866 pol = __get_vma_policy(vma, addr, ilx); 1867 if (!pol) 1868 pol = get_task_policy(current); 1869 if (pol->mode == MPOL_INTERLEAVE || 1870 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1871 *ilx += vma->vm_pgoff >> order; 1872 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); 1873 } 1874 return pol; 1875 } 1876 1877 bool vma_policy_mof(struct vm_area_struct *vma) 1878 { 1879 struct mempolicy *pol; 1880 1881 if (vma->vm_ops && vma->vm_ops->get_policy) { 1882 bool ret = false; 1883 pgoff_t ilx; /* ignored here */ 1884 1885 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); 1886 if (pol && (pol->flags & MPOL_F_MOF)) 1887 ret = true; 1888 mpol_cond_put(pol); 1889 1890 return ret; 1891 } 1892 1893 pol = vma->vm_policy; 1894 if (!pol) 1895 pol = get_task_policy(current); 1896 1897 return pol->flags & MPOL_F_MOF; 1898 } 1899 1900 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1901 { 1902 enum zone_type dynamic_policy_zone = policy_zone; 1903 1904 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 1905 1906 /* 1907 * if policy->nodes has movable memory only, 1908 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 1909 * 1910 * policy->nodes is intersect with node_states[N_MEMORY]. 1911 * so if the following test fails, it implies 1912 * policy->nodes has movable memory only. 1913 */ 1914 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) 1915 dynamic_policy_zone = ZONE_MOVABLE; 1916 1917 return zone >= dynamic_policy_zone; 1918 } 1919 1920 static unsigned int weighted_interleave_nodes(struct mempolicy *policy) 1921 { 1922 unsigned int node; 1923 unsigned int cpuset_mems_cookie; 1924 1925 retry: 1926 /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ 1927 cpuset_mems_cookie = read_mems_allowed_begin(); 1928 node = current->il_prev; 1929 if (!current->il_weight || !node_isset(node, policy->nodes)) { 1930 node = next_node_in(node, policy->nodes); 1931 if (read_mems_allowed_retry(cpuset_mems_cookie)) 1932 goto retry; 1933 if (node == MAX_NUMNODES) 1934 return node; 1935 current->il_prev = node; 1936 current->il_weight = get_il_weight(node); 1937 } 1938 current->il_weight--; 1939 return node; 1940 } 1941 1942 /* Do dynamic interleaving for a process */ 1943 static unsigned int interleave_nodes(struct mempolicy *policy) 1944 { 1945 unsigned int nid; 1946 unsigned int cpuset_mems_cookie; 1947 1948 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ 1949 do { 1950 cpuset_mems_cookie = read_mems_allowed_begin(); 1951 nid = next_node_in(current->il_prev, policy->nodes); 1952 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 1953 1954 if (nid < MAX_NUMNODES) 1955 current->il_prev = nid; 1956 return nid; 1957 } 1958 1959 /* 1960 * Depending on the memory policy provide a node from which to allocate the 1961 * next slab entry. 
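 *
 * For MPOL_WEIGHTED_INTERLEAVE the per-task (il_prev, il_weight) pair
 * yields, for example, the repeating sequence 0,0,1 for nodes {0,1} with
 * weights {2,1}: weighted_interleave_nodes() above keeps returning il_prev
 * while il_weight is non-zero, then advances to the next allowed node and
 * reloads il_weight from get_il_weight().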
1962 */ 1963 unsigned int mempolicy_slab_node(void) 1964 { 1965 struct mempolicy *policy; 1966 int node = numa_mem_id(); 1967 1968 if (!in_task()) 1969 return node; 1970 1971 policy = current->mempolicy; 1972 if (!policy) 1973 return node; 1974 1975 switch (policy->mode) { 1976 case MPOL_PREFERRED: 1977 return first_node(policy->nodes); 1978 1979 case MPOL_INTERLEAVE: 1980 return interleave_nodes(policy); 1981 1982 case MPOL_WEIGHTED_INTERLEAVE: 1983 return weighted_interleave_nodes(policy); 1984 1985 case MPOL_BIND: 1986 case MPOL_PREFERRED_MANY: 1987 { 1988 struct zoneref *z; 1989 1990 /* 1991 * Follow bind policy behavior and start allocation at the 1992 * first node. 1993 */ 1994 struct zonelist *zonelist; 1995 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1996 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 1997 z = first_zones_zonelist(zonelist, highest_zoneidx, 1998 &policy->nodes); 1999 return zonelist_zone(z) ? zonelist_node_idx(z) : node; 2000 } 2001 case MPOL_LOCAL: 2002 return node; 2003 2004 default: 2005 BUG(); 2006 } 2007 } 2008 2009 static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 2010 nodemask_t *mask) 2011 { 2012 /* 2013 * barrier stabilizes the nodemask locally so that it can be iterated 2014 * over safely without concern for changes. Allocators validate node 2015 * selection does not violate mems_allowed, so this is safe. 2016 */ 2017 barrier(); 2018 memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 2019 barrier(); 2020 return nodes_weight(*mask); 2021 } 2022 2023 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2024 { 2025 nodemask_t nodemask; 2026 unsigned int target, nr_nodes; 2027 u8 *table; 2028 unsigned int weight_total = 0; 2029 u8 weight; 2030 int nid; 2031 2032 nr_nodes = read_once_policy_nodemask(pol, &nodemask); 2033 if (!nr_nodes) 2034 return numa_node_id(); 2035 2036 rcu_read_lock(); 2037 table = rcu_dereference(iw_table); 2038 /* calculate the total weight */ 2039 for_each_node_mask(nid, nodemask) { 2040 /* detect system default usage */ 2041 weight = table ? table[nid] : 1; 2042 weight = weight ? weight : 1; 2043 weight_total += weight; 2044 } 2045 2046 /* Calculate the node offset based on totals */ 2047 target = ilx % weight_total; 2048 nid = first_node(nodemask); 2049 while (target) { 2050 /* detect system default usage */ 2051 weight = table ? table[nid] : 1; 2052 weight = weight ? weight : 1; 2053 if (target < weight) 2054 break; 2055 target -= weight; 2056 nid = next_node_in(nid, nodemask); 2057 } 2058 rcu_read_unlock(); 2059 return nid; 2060 } 2061 2062 /* 2063 * Do static interleaving for interleave index @ilx. Returns the ilx'th 2064 * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2065 * exceeds the number of present nodes. 2066 */ 2067 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2068 { 2069 nodemask_t nodemask; 2070 unsigned int target, nnodes; 2071 int i; 2072 int nid; 2073 2074 nnodes = read_once_policy_nodemask(pol, &nodemask); 2075 if (!nnodes) 2076 return numa_node_id(); 2077 target = ilx % nnodes; 2078 nid = first_node(nodemask); 2079 for (i = 0; i < target; i++) 2080 nid = next_node(nid, nodemask); 2081 return nid; 2082 } 2083 2084 /* 2085 * Return a nodemask representing a mempolicy for filtering nodes for 2086 * page allocation, together with preferred node id (or the input node id). 
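 *
 * For example (illustrative, assuming the zone and cpuset checks pass):
 *	MPOL_PREFERRED  over node 3            -> *nid = 3, returns NULL
 *	MPOL_BIND       over nodes 0-1         -> *nid unchanged (or home_node),
 *	                                          returns &pol->nodes
 *	MPOL_INTERLEAVE over nodes 0-1, ilx 5  -> *nid = 1, returns NULL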
2087 */ 2088 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2089 pgoff_t ilx, int *nid) 2090 { 2091 nodemask_t *nodemask = NULL; 2092 2093 switch (pol->mode) { 2094 case MPOL_PREFERRED: 2095 /* Override input node id */ 2096 *nid = first_node(pol->nodes); 2097 break; 2098 case MPOL_PREFERRED_MANY: 2099 nodemask = &pol->nodes; 2100 if (pol->home_node != NUMA_NO_NODE) 2101 *nid = pol->home_node; 2102 break; 2103 case MPOL_BIND: 2104 /* Restrict to nodemask (but not on lower zones) */ 2105 if (apply_policy_zone(pol, gfp_zone(gfp)) && 2106 cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2107 nodemask = &pol->nodes; 2108 if (pol->home_node != NUMA_NO_NODE) 2109 *nid = pol->home_node; 2110 /* 2111 * __GFP_THISNODE shouldn't even be used with the bind policy 2112 * because we might easily break the expectation to stay on the 2113 * requested node and not break the policy. 2114 */ 2115 WARN_ON_ONCE(gfp & __GFP_THISNODE); 2116 break; 2117 case MPOL_INTERLEAVE: 2118 /* Override input node id */ 2119 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2120 interleave_nodes(pol) : interleave_nid(pol, ilx); 2121 break; 2122 case MPOL_WEIGHTED_INTERLEAVE: 2123 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2124 weighted_interleave_nodes(pol) : 2125 weighted_interleave_nid(pol, ilx); 2126 break; 2127 } 2128 2129 return nodemask; 2130 } 2131 2132 #ifdef CONFIG_HUGETLBFS 2133 /* 2134 * huge_node(@vma, @addr, @gfp_flags, @mpol) 2135 * @vma: virtual memory area whose policy is sought 2136 * @addr: address in @vma for shared policy lookup and interleave policy 2137 * @gfp_flags: for requested zone 2138 * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2139 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2140 * 2141 * Returns a nid suitable for a huge page allocation and a pointer 2142 * to the struct mempolicy for conditional unref after allocation. 2143 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2144 * to the mempolicy's @nodemask for filtering the zonelist. 2145 */ 2146 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 2147 struct mempolicy **mpol, nodemask_t **nodemask) 2148 { 2149 pgoff_t ilx; 2150 int nid; 2151 2152 nid = numa_node_id(); 2153 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2154 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 2155 return nid; 2156 } 2157 2158 /* 2159 * init_nodemask_of_mempolicy 2160 * 2161 * If the current task's mempolicy is "default" [NULL], return 'false' 2162 * to indicate default policy. Otherwise, extract the policy nodemask 2163 * for 'bind' or 'interleave' policy into the argument nodemask, or 2164 * initialize the argument nodemask to contain the single node for 2165 * 'preferred' or 'local' policy and return 'true' to indicate presence 2166 * of non-default mempolicy. 2167 * 2168 * We don't bother with reference counting the mempolicy [mpol_get/put] 2169 * because the current task is examining it's own mempolicy and a task's 2170 * mempolicy is only ever changed by the task itself. 2171 * 2172 * N.B., it is the caller's responsibility to free a returned nodemask. 
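 *
 * Sketch of a hypothetical caller (not from this file), showing the
 * allocate/inspect/free pattern:
 *
 *	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
 *
 *	if (mask && init_nodemask_of_mempolicy(mask))
 *		...operate only on the nodes set in *mask...
 *	NODEMASK_FREE(mask);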
2173 */ 2174 bool init_nodemask_of_mempolicy(nodemask_t *mask) 2175 { 2176 struct mempolicy *mempolicy; 2177 2178 if (!(mask && current->mempolicy)) 2179 return false; 2180 2181 task_lock(current); 2182 mempolicy = current->mempolicy; 2183 switch (mempolicy->mode) { 2184 case MPOL_PREFERRED: 2185 case MPOL_PREFERRED_MANY: 2186 case MPOL_BIND: 2187 case MPOL_INTERLEAVE: 2188 case MPOL_WEIGHTED_INTERLEAVE: 2189 *mask = mempolicy->nodes; 2190 break; 2191 2192 case MPOL_LOCAL: 2193 init_nodemask_of_node(mask, numa_node_id()); 2194 break; 2195 2196 default: 2197 BUG(); 2198 } 2199 task_unlock(current); 2200 2201 return true; 2202 } 2203 #endif 2204 2205 /* 2206 * mempolicy_in_oom_domain 2207 * 2208 * If tsk's mempolicy is "bind", check for intersection between mask and 2209 * the policy nodemask. Otherwise, return true for all other policies 2210 * including "interleave", as a tsk with "interleave" policy may have 2211 * memory allocated from all nodes in system. 2212 * 2213 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 2214 */ 2215 bool mempolicy_in_oom_domain(struct task_struct *tsk, 2216 const nodemask_t *mask) 2217 { 2218 struct mempolicy *mempolicy; 2219 bool ret = true; 2220 2221 if (!mask) 2222 return ret; 2223 2224 task_lock(tsk); 2225 mempolicy = tsk->mempolicy; 2226 if (mempolicy && mempolicy->mode == MPOL_BIND) 2227 ret = nodes_intersects(mempolicy->nodes, *mask); 2228 task_unlock(tsk); 2229 2230 return ret; 2231 } 2232 2233 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2234 int nid, nodemask_t *nodemask) 2235 { 2236 struct page *page; 2237 gfp_t preferred_gfp; 2238 2239 /* 2240 * This is a two pass approach. The first pass will only try the 2241 * preferred nodes but skip the direct reclaim and allow the 2242 * allocation to fail, while the second pass will try all the 2243 * nodes in system. 2244 */ 2245 preferred_gfp = gfp | __GFP_NOWARN; 2246 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2247 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); 2248 if (!page) 2249 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); 2250 2251 return page; 2252 } 2253 2254 /** 2255 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2256 * @gfp: GFP flags. 2257 * @order: Order of the page allocation. 2258 * @pol: Pointer to the NUMA mempolicy. 2259 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2260 * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2261 * 2262 * Return: The page on success or NULL if allocation fails. 2263 */ 2264 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2265 struct mempolicy *pol, pgoff_t ilx, int nid) 2266 { 2267 nodemask_t *nodemask; 2268 struct page *page; 2269 2270 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2271 2272 if (pol->mode == MPOL_PREFERRED_MANY) 2273 return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2274 2275 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2276 /* filter "hugepage" allocation, unless from alloc_pages() */ 2277 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2278 /* 2279 * For hugepage allocation and non-interleave policy which 2280 * allows the current node (or other explicitly preferred 2281 * node) we only try to allocate from the current/preferred 2282 * node and don't fall back to other nodes, as the cost of 2283 * remote accesses would likely offset THP benefits. 
2284 * 2285 * If the policy is interleave or does not allow the current 2286 * node in its nodemask, we allocate the standard way. 2287 */ 2288 if (pol->mode != MPOL_INTERLEAVE && 2289 pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2290 (!nodemask || node_isset(nid, *nodemask))) { 2291 /* 2292 * First, try to allocate THP only on local node, but 2293 * don't reclaim unnecessarily, just compact. 2294 */ 2295 page = __alloc_frozen_pages_noprof( 2296 gfp | __GFP_THISNODE | __GFP_NORETRY, order, 2297 nid, NULL); 2298 if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2299 return page; 2300 /* 2301 * If hugepage allocations are configured to always 2302 * synchronous compact or the vma has been madvised 2303 * to prefer hugepage backing, retry allowing remote 2304 * memory with both reclaim and compact as well. 2305 */ 2306 } 2307 } 2308 2309 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); 2310 2311 if (unlikely(pol->mode == MPOL_INTERLEAVE || 2312 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { 2313 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2314 if (static_branch_likely(&vm_numa_stat_key) && 2315 page_to_nid(page) == nid) { 2316 preempt_disable(); 2317 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2318 preempt_enable(); 2319 } 2320 } 2321 2322 return page; 2323 } 2324 2325 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, 2326 struct mempolicy *pol, pgoff_t ilx, int nid) 2327 { 2328 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, 2329 ilx, nid); 2330 if (!page) 2331 return NULL; 2332 2333 set_page_refcounted(page); 2334 return page_rmappable_folio(page); 2335 } 2336 2337 /** 2338 * vma_alloc_folio - Allocate a folio for a VMA. 2339 * @gfp: GFP flags. 2340 * @order: Order of the folio. 2341 * @vma: Pointer to VMA. 2342 * @addr: Virtual address of the allocation. Must be inside @vma. 2343 * 2344 * Allocate a folio for a specific address in @vma, using the appropriate 2345 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2346 * VMA to prevent it from going away. Should be used for all allocations 2347 * for folios that will be mapped into user space, excepting hugetlbfs, and 2348 * excepting where direct use of folio_alloc_mpol() is more appropriate. 2349 * 2350 * Return: The folio on success or NULL if allocation fails. 2351 */ 2352 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, 2353 unsigned long addr) 2354 { 2355 struct mempolicy *pol; 2356 pgoff_t ilx; 2357 struct folio *folio; 2358 2359 if (vma->vm_flags & VM_DROPPABLE) 2360 gfp |= __GFP_NOWARN; 2361 2362 pol = get_vma_policy(vma, addr, order, &ilx); 2363 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2364 mpol_cond_put(pol); 2365 return folio; 2366 } 2367 EXPORT_SYMBOL(vma_alloc_folio_noprof); 2368 2369 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) 2370 { 2371 struct mempolicy *pol = &default_policy; 2372 2373 /* 2374 * No reference counting needed for current->mempolicy 2375 * nor system default_policy 2376 */ 2377 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2378 pol = get_task_policy(current); 2379 2380 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, 2381 numa_node_id()); 2382 } 2383 2384 /** 2385 * alloc_pages - Allocate pages. 2386 * @gfp: GFP flags. 2387 * @order: Power of two of number of pages to allocate. 2388 * 2389 * Allocate 1 << @order contiguous pages. 
The physical address of the 2390 * first page is naturally aligned (eg an order-3 allocation will be aligned 2391 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 2392 * process is honoured when in process context. 2393 * 2394 * Context: Can be called from any context, providing the appropriate GFP 2395 * flags are used. 2396 * Return: The page on success or NULL if allocation fails. 2397 */ 2398 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) 2399 { 2400 struct page *page = alloc_frozen_pages_noprof(gfp, order); 2401 2402 if (page) 2403 set_page_refcounted(page); 2404 return page; 2405 } 2406 EXPORT_SYMBOL(alloc_pages_noprof); 2407 2408 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) 2409 { 2410 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); 2411 } 2412 EXPORT_SYMBOL(folio_alloc_noprof); 2413 2414 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, 2415 struct mempolicy *pol, unsigned long nr_pages, 2416 struct page **page_array) 2417 { 2418 int nodes; 2419 unsigned long nr_pages_per_node; 2420 int delta; 2421 int i; 2422 unsigned long nr_allocated; 2423 unsigned long total_allocated = 0; 2424 2425 nodes = nodes_weight(pol->nodes); 2426 nr_pages_per_node = nr_pages / nodes; 2427 delta = nr_pages - nodes * nr_pages_per_node; 2428 2429 for (i = 0; i < nodes; i++) { 2430 if (delta) { 2431 nr_allocated = alloc_pages_bulk_noprof(gfp, 2432 interleave_nodes(pol), NULL, 2433 nr_pages_per_node + 1, 2434 page_array); 2435 delta--; 2436 } else { 2437 nr_allocated = alloc_pages_bulk_noprof(gfp, 2438 interleave_nodes(pol), NULL, 2439 nr_pages_per_node, page_array); 2440 } 2441 2442 page_array += nr_allocated; 2443 total_allocated += nr_allocated; 2444 } 2445 2446 return total_allocated; 2447 } 2448 2449 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, 2450 struct mempolicy *pol, unsigned long nr_pages, 2451 struct page **page_array) 2452 { 2453 struct task_struct *me = current; 2454 unsigned int cpuset_mems_cookie; 2455 unsigned long total_allocated = 0; 2456 unsigned long nr_allocated = 0; 2457 unsigned long rounds; 2458 unsigned long node_pages, delta; 2459 u8 *table, *weights, weight; 2460 unsigned int weight_total = 0; 2461 unsigned long rem_pages = nr_pages; 2462 nodemask_t nodes; 2463 int nnodes, node; 2464 int resume_node = MAX_NUMNODES - 1; 2465 u8 resume_weight = 0; 2466 int prev_node; 2467 int i; 2468 2469 if (!nr_pages) 2470 return 0; 2471 2472 /* read the nodes onto the stack, retry if done during rebind */ 2473 do { 2474 cpuset_mems_cookie = read_mems_allowed_begin(); 2475 nnodes = read_once_policy_nodemask(pol, &nodes); 2476 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2477 2478 /* if the nodemask has become invalid, we cannot do anything */ 2479 if (!nnodes) 2480 return 0; 2481 2482 /* Continue allocating from most recent node and adjust the nr_pages */ 2483 node = me->il_prev; 2484 weight = me->il_weight; 2485 if (weight && node_isset(node, nodes)) { 2486 node_pages = min(rem_pages, weight); 2487 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2488 page_array); 2489 page_array += nr_allocated; 2490 total_allocated += nr_allocated; 2491 /* if that's all the pages, no need to interleave */ 2492 if (rem_pages <= weight) { 2493 me->il_weight -= rem_pages; 2494 return total_allocated; 2495 } 2496 /* Otherwise we adjust remaining pages, continue from there */ 2497 rem_pages -= weight; 2498 } 2499 /* clear active weight in case of an allocation failure */ 2500 
me->il_weight = 0; 2501 prev_node = node; 2502 2503 /* create a local copy of node weights to operate on outside rcu */ 2504 weights = kzalloc(nr_node_ids, GFP_KERNEL); 2505 if (!weights) 2506 return total_allocated; 2507 2508 rcu_read_lock(); 2509 table = rcu_dereference(iw_table); 2510 if (table) 2511 memcpy(weights, table, nr_node_ids); 2512 rcu_read_unlock(); 2513 2514 /* calculate total, detect system default usage */ 2515 for_each_node_mask(node, nodes) { 2516 if (!weights[node]) 2517 weights[node] = 1; 2518 weight_total += weights[node]; 2519 } 2520 2521 /* 2522 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2523 * Track which node weighted interleave should resume from. 2524 * 2525 * if (rounds > 0) and (delta == 0), resume_node will always be 2526 * the node following prev_node and its weight. 2527 */ 2528 rounds = rem_pages / weight_total; 2529 delta = rem_pages % weight_total; 2530 resume_node = next_node_in(prev_node, nodes); 2531 resume_weight = weights[resume_node]; 2532 for (i = 0; i < nnodes; i++) { 2533 node = next_node_in(prev_node, nodes); 2534 weight = weights[node]; 2535 node_pages = weight * rounds; 2536 /* If a delta exists, add this node's portion of the delta */ 2537 if (delta > weight) { 2538 node_pages += weight; 2539 delta -= weight; 2540 } else if (delta) { 2541 /* when delta is depleted, resume from that node */ 2542 node_pages += delta; 2543 resume_node = node; 2544 resume_weight = weight - delta; 2545 delta = 0; 2546 } 2547 /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2548 if (!node_pages) 2549 break; 2550 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2551 page_array); 2552 page_array += nr_allocated; 2553 total_allocated += nr_allocated; 2554 if (total_allocated == nr_pages) 2555 break; 2556 prev_node = node; 2557 } 2558 me->il_prev = resume_node; 2559 me->il_weight = resume_weight; 2560 kfree(weights); 2561 return total_allocated; 2562 } 2563 2564 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, 2565 struct mempolicy *pol, unsigned long nr_pages, 2566 struct page **page_array) 2567 { 2568 gfp_t preferred_gfp; 2569 unsigned long nr_allocated = 0; 2570 2571 preferred_gfp = gfp | __GFP_NOWARN; 2572 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2573 2574 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, 2575 nr_pages, page_array); 2576 2577 if (nr_allocated < nr_pages) 2578 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, 2579 nr_pages - nr_allocated, 2580 page_array + nr_allocated); 2581 return nr_allocated; 2582 } 2583 2584 /* alloc pages bulk and mempolicy should be considered at the 2585 * same time in some situation such as vmalloc. 2586 * 2587 * It can accelerate memory allocation especially interleaving 2588 * allocate memory. 
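 *
 * Illustrative sketch of a hypothetical caller (vmalloc does something
 * similar when filling its page array):
 *
 *	struct page **pages = kvcalloc(nr, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	got = alloc_pages_bulk_mempolicy_noprof(GFP_KERNEL, nr, pages);
 *	if (got < nr)
 *		...allocate the remaining nr - got pages one at a time...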
2589 */ 2590 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, 2591 unsigned long nr_pages, struct page **page_array) 2592 { 2593 struct mempolicy *pol = &default_policy; 2594 nodemask_t *nodemask; 2595 int nid; 2596 2597 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2598 pol = get_task_policy(current); 2599 2600 if (pol->mode == MPOL_INTERLEAVE) 2601 return alloc_pages_bulk_interleave(gfp, pol, 2602 nr_pages, page_array); 2603 2604 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2605 return alloc_pages_bulk_weighted_interleave( 2606 gfp, pol, nr_pages, page_array); 2607 2608 if (pol->mode == MPOL_PREFERRED_MANY) 2609 return alloc_pages_bulk_preferred_many(gfp, 2610 numa_node_id(), pol, nr_pages, page_array); 2611 2612 nid = numa_node_id(); 2613 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2614 return alloc_pages_bulk_noprof(gfp, nid, nodemask, 2615 nr_pages, page_array); 2616 } 2617 2618 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2619 { 2620 struct mempolicy *pol = mpol_dup(src->vm_policy); 2621 2622 if (IS_ERR(pol)) 2623 return PTR_ERR(pol); 2624 dst->vm_policy = pol; 2625 return 0; 2626 } 2627 2628 /* 2629 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2630 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2631 * with the mems_allowed returned by cpuset_mems_allowed(). This 2632 * keeps mempolicies cpuset relative after its cpuset moves. See 2633 * further kernel/cpuset.c update_nodemask(). 2634 * 2635 * current's mempolicy may be rebinded by the other task(the task that changes 2636 * cpuset's mems), so we needn't do rebind work for current task. 2637 */ 2638 2639 /* Slow path of a mempolicy duplicate */ 2640 struct mempolicy *__mpol_dup(struct mempolicy *old) 2641 { 2642 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2643 2644 if (!new) 2645 return ERR_PTR(-ENOMEM); 2646 2647 /* task's mempolicy is protected by alloc_lock */ 2648 if (old == current->mempolicy) { 2649 task_lock(current); 2650 *new = *old; 2651 task_unlock(current); 2652 } else 2653 *new = *old; 2654 2655 if (current_cpuset_is_being_rebound()) { 2656 nodemask_t mems = cpuset_mems_allowed(current); 2657 mpol_rebind_policy(new, &mems); 2658 } 2659 atomic_set(&new->refcnt, 1); 2660 return new; 2661 } 2662 2663 /* Slow path of a mempolicy comparison */ 2664 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2665 { 2666 if (!a || !b) 2667 return false; 2668 if (a->mode != b->mode) 2669 return false; 2670 if (a->flags != b->flags) 2671 return false; 2672 if (a->home_node != b->home_node) 2673 return false; 2674 if (mpol_store_user_nodemask(a)) 2675 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2676 return false; 2677 2678 switch (a->mode) { 2679 case MPOL_BIND: 2680 case MPOL_INTERLEAVE: 2681 case MPOL_PREFERRED: 2682 case MPOL_PREFERRED_MANY: 2683 case MPOL_WEIGHTED_INTERLEAVE: 2684 return !!nodes_equal(a->nodes, b->nodes); 2685 case MPOL_LOCAL: 2686 return true; 2687 default: 2688 BUG(); 2689 return false; 2690 } 2691 } 2692 2693 /* 2694 * Shared memory backing store policy support. 2695 * 2696 * Remember policies even when nobody has shared memory mapped. 2697 * The policies are kept in Red-Black tree linked from the inode. 2698 * They are protected by the sp->lock rwlock, which should be held 2699 * for any accesses to the tree. 2700 */ 2701 2702 /* 2703 * lookup first element intersecting start-end. 
Caller holds sp->lock for 2704 * reading or for writing 2705 */ 2706 static struct sp_node *sp_lookup(struct shared_policy *sp, 2707 pgoff_t start, pgoff_t end) 2708 { 2709 struct rb_node *n = sp->root.rb_node; 2710 2711 while (n) { 2712 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2713 2714 if (start >= p->end) 2715 n = n->rb_right; 2716 else if (end <= p->start) 2717 n = n->rb_left; 2718 else 2719 break; 2720 } 2721 if (!n) 2722 return NULL; 2723 for (;;) { 2724 struct sp_node *w = NULL; 2725 struct rb_node *prev = rb_prev(n); 2726 if (!prev) 2727 break; 2728 w = rb_entry(prev, struct sp_node, nd); 2729 if (w->end <= start) 2730 break; 2731 n = prev; 2732 } 2733 return rb_entry(n, struct sp_node, nd); 2734 } 2735 2736 /* 2737 * Insert a new shared policy into the list. Caller holds sp->lock for 2738 * writing. 2739 */ 2740 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2741 { 2742 struct rb_node **p = &sp->root.rb_node; 2743 struct rb_node *parent = NULL; 2744 struct sp_node *nd; 2745 2746 while (*p) { 2747 parent = *p; 2748 nd = rb_entry(parent, struct sp_node, nd); 2749 if (new->start < nd->start) 2750 p = &(*p)->rb_left; 2751 else if (new->end > nd->end) 2752 p = &(*p)->rb_right; 2753 else 2754 BUG(); 2755 } 2756 rb_link_node(&new->nd, parent, p); 2757 rb_insert_color(&new->nd, &sp->root); 2758 } 2759 2760 /* Find shared policy intersecting idx */ 2761 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 2762 pgoff_t idx) 2763 { 2764 struct mempolicy *pol = NULL; 2765 struct sp_node *sn; 2766 2767 if (!sp->root.rb_node) 2768 return NULL; 2769 read_lock(&sp->lock); 2770 sn = sp_lookup(sp, idx, idx+1); 2771 if (sn) { 2772 mpol_get(sn->policy); 2773 pol = sn->policy; 2774 } 2775 read_unlock(&sp->lock); 2776 return pol; 2777 } 2778 2779 static void sp_free(struct sp_node *n) 2780 { 2781 mpol_put(n->policy); 2782 kmem_cache_free(sn_cache, n); 2783 } 2784 2785 /** 2786 * mpol_misplaced - check whether current folio node is valid in policy 2787 * 2788 * @folio: folio to be checked 2789 * @vmf: structure describing the fault 2790 * @addr: virtual address in @vma for shared policy lookup and interleave policy 2791 * 2792 * Lookup current policy node id for vma,addr and "compare to" folio's 2793 * node id. Policy determination "mimics" alloc_page_vma(). 2794 * Called from fault path where we know the vma and faulting address. 2795 * 2796 * Return: NUMA_NO_NODE if the page is in a node that is valid for this 2797 * policy, or a suitable node ID to allocate a replacement folio from. 
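 *
 * For example (illustrative, assuming MPOL_F_MOF is set and NUMA balancing's
 * MPOL_F_MORON is not): with MPOL_BIND over nodes 0-1, a folio currently on
 * node 2 is reported as misplaced and the nearest allowed node out of 0-1 is
 * returned, while a folio already on node 0 or 1 yields NUMA_NO_NODE.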
2798 */ 2799 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, 2800 unsigned long addr) 2801 { 2802 struct mempolicy *pol; 2803 pgoff_t ilx; 2804 struct zoneref *z; 2805 int curnid = folio_nid(folio); 2806 struct vm_area_struct *vma = vmf->vma; 2807 int thiscpu = raw_smp_processor_id(); 2808 int thisnid = numa_node_id(); 2809 int polnid = NUMA_NO_NODE; 2810 int ret = NUMA_NO_NODE; 2811 2812 /* 2813 * Make sure ptl is held so that we don't preempt and we 2814 * have a stable smp processor id 2815 */ 2816 lockdep_assert_held(vmf->ptl); 2817 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2818 if (!(pol->flags & MPOL_F_MOF)) 2819 goto out; 2820 2821 switch (pol->mode) { 2822 case MPOL_INTERLEAVE: 2823 polnid = interleave_nid(pol, ilx); 2824 break; 2825 2826 case MPOL_WEIGHTED_INTERLEAVE: 2827 polnid = weighted_interleave_nid(pol, ilx); 2828 break; 2829 2830 case MPOL_PREFERRED: 2831 if (node_isset(curnid, pol->nodes)) 2832 goto out; 2833 polnid = first_node(pol->nodes); 2834 break; 2835 2836 case MPOL_LOCAL: 2837 polnid = numa_node_id(); 2838 break; 2839 2840 case MPOL_BIND: 2841 case MPOL_PREFERRED_MANY: 2842 /* 2843 * Even though MPOL_PREFERRED_MANY can allocate pages outside 2844 * policy nodemask we don't allow numa migration to nodes 2845 * outside policy nodemask for now. This is done so that if we 2846 * want demotion to slow memory to happen, before allocating 2847 * from some DRAM node say 'x', we will end up using a 2848 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario 2849 * we should not promote to node 'x' from slow memory node. 2850 */ 2851 if (pol->flags & MPOL_F_MORON) { 2852 /* 2853 * Optimize placement among multiple nodes 2854 * via NUMA balancing 2855 */ 2856 if (node_isset(thisnid, pol->nodes)) 2857 break; 2858 goto out; 2859 } 2860 2861 /* 2862 * use current page if in policy nodemask, 2863 * else select nearest allowed node, if any. 2864 * If no allowed nodes, use current [!misplaced]. 2865 */ 2866 if (node_isset(curnid, pol->nodes)) 2867 goto out; 2868 z = first_zones_zonelist( 2869 node_zonelist(thisnid, GFP_HIGHUSER), 2870 gfp_zone(GFP_HIGHUSER), 2871 &pol->nodes); 2872 polnid = zonelist_node_idx(z); 2873 break; 2874 2875 default: 2876 BUG(); 2877 } 2878 2879 /* Migrate the folio towards the node whose CPU is referencing it */ 2880 if (pol->flags & MPOL_F_MORON) { 2881 polnid = thisnid; 2882 2883 if (!should_numa_migrate_memory(current, folio, curnid, 2884 thiscpu)) 2885 goto out; 2886 } 2887 2888 if (curnid != polnid) 2889 ret = polnid; 2890 out: 2891 mpol_cond_put(pol); 2892 2893 return ret; 2894 } 2895 2896 /* 2897 * Drop the (possibly final) reference to task->mempolicy. It needs to be 2898 * dropped after task->mempolicy is set to NULL so that any allocation done as 2899 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 2900 * policy. 
2901 */ 2902 void mpol_put_task_policy(struct task_struct *task) 2903 { 2904 struct mempolicy *pol; 2905 2906 task_lock(task); 2907 pol = task->mempolicy; 2908 task->mempolicy = NULL; 2909 task_unlock(task); 2910 mpol_put(pol); 2911 } 2912 2913 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2914 { 2915 rb_erase(&n->nd, &sp->root); 2916 sp_free(n); 2917 } 2918 2919 static void sp_node_init(struct sp_node *node, unsigned long start, 2920 unsigned long end, struct mempolicy *pol) 2921 { 2922 node->start = start; 2923 node->end = end; 2924 node->policy = pol; 2925 } 2926 2927 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2928 struct mempolicy *pol) 2929 { 2930 struct sp_node *n; 2931 struct mempolicy *newpol; 2932 2933 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2934 if (!n) 2935 return NULL; 2936 2937 newpol = mpol_dup(pol); 2938 if (IS_ERR(newpol)) { 2939 kmem_cache_free(sn_cache, n); 2940 return NULL; 2941 } 2942 newpol->flags |= MPOL_F_SHARED; 2943 sp_node_init(n, start, end, newpol); 2944 2945 return n; 2946 } 2947 2948 /* Replace a policy range. */ 2949 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 2950 pgoff_t end, struct sp_node *new) 2951 { 2952 struct sp_node *n; 2953 struct sp_node *n_new = NULL; 2954 struct mempolicy *mpol_new = NULL; 2955 int ret = 0; 2956 2957 restart: 2958 write_lock(&sp->lock); 2959 n = sp_lookup(sp, start, end); 2960 /* Take care of old policies in the same range. */ 2961 while (n && n->start < end) { 2962 struct rb_node *next = rb_next(&n->nd); 2963 if (n->start >= start) { 2964 if (n->end <= end) 2965 sp_delete(sp, n); 2966 else 2967 n->start = end; 2968 } else { 2969 /* Old policy spanning whole new range. */ 2970 if (n->end > end) { 2971 if (!n_new) 2972 goto alloc_new; 2973 2974 *mpol_new = *n->policy; 2975 atomic_set(&mpol_new->refcnt, 1); 2976 sp_node_init(n_new, end, n->end, mpol_new); 2977 n->end = start; 2978 sp_insert(sp, n_new); 2979 n_new = NULL; 2980 mpol_new = NULL; 2981 break; 2982 } else 2983 n->end = start; 2984 } 2985 if (!next) 2986 break; 2987 n = rb_entry(next, struct sp_node, nd); 2988 } 2989 if (new) 2990 sp_insert(sp, new); 2991 write_unlock(&sp->lock); 2992 ret = 0; 2993 2994 err_out: 2995 if (mpol_new) 2996 mpol_put(mpol_new); 2997 if (n_new) 2998 kmem_cache_free(sn_cache, n_new); 2999 3000 return ret; 3001 3002 alloc_new: 3003 write_unlock(&sp->lock); 3004 ret = -ENOMEM; 3005 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3006 if (!n_new) 3007 goto err_out; 3008 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3009 if (!mpol_new) 3010 goto err_out; 3011 atomic_set(&mpol_new->refcnt, 1); 3012 goto restart; 3013 } 3014 3015 /** 3016 * mpol_shared_policy_init - initialize shared policy for inode 3017 * @sp: pointer to inode shared policy 3018 * @mpol: struct mempolicy to install 3019 * 3020 * Install non-NULL @mpol in inode's shared policy rb-tree. 3021 * On entry, the current task has a reference on a non-NULL @mpol. 3022 * This must be released on exit. 3023 * This is called at get_inode() calls and we can use GFP_KERNEL. 
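 *
 * Sketch of a hypothetical caller (tmpfs-style; sb_mpol and info->policy are
 * made-up names), passing the mount's mempolicy down to a new inode:
 *
 *	mpol_get(sb_mpol);			(the ref that init will drop)
 *	mpol_shared_policy_init(&info->policy, sb_mpol);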
3024 */ 3025 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3026 { 3027 int ret; 3028 3029 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3030 rwlock_init(&sp->lock); 3031 3032 if (mpol) { 3033 struct sp_node *sn; 3034 struct mempolicy *npol; 3035 NODEMASK_SCRATCH(scratch); 3036 3037 if (!scratch) 3038 goto put_mpol; 3039 3040 /* contextualize the tmpfs mount point mempolicy to this file */ 3041 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3042 if (IS_ERR(npol)) 3043 goto free_scratch; /* no valid nodemask intersection */ 3044 3045 task_lock(current); 3046 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3047 task_unlock(current); 3048 if (ret) 3049 goto put_npol; 3050 3051 /* alloc node covering entire file; adds ref to file's npol */ 3052 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3053 if (sn) 3054 sp_insert(sp, sn); 3055 put_npol: 3056 mpol_put(npol); /* drop initial ref on file's npol */ 3057 free_scratch: 3058 NODEMASK_SCRATCH_FREE(scratch); 3059 put_mpol: 3060 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3061 } 3062 } 3063 3064 int mpol_set_shared_policy(struct shared_policy *sp, 3065 struct vm_area_struct *vma, struct mempolicy *pol) 3066 { 3067 int err; 3068 struct sp_node *new = NULL; 3069 unsigned long sz = vma_pages(vma); 3070 3071 if (pol) { 3072 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3073 if (!new) 3074 return -ENOMEM; 3075 } 3076 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3077 if (err && new) 3078 sp_free(new); 3079 return err; 3080 } 3081 3082 /* Free a backing policy store on inode delete. */ 3083 void mpol_free_shared_policy(struct shared_policy *sp) 3084 { 3085 struct sp_node *n; 3086 struct rb_node *next; 3087 3088 if (!sp->root.rb_node) 3089 return; 3090 write_lock(&sp->lock); 3091 next = rb_first(&sp->root); 3092 while (next) { 3093 n = rb_entry(next, struct sp_node, nd); 3094 next = rb_next(&n->nd); 3095 sp_delete(sp, n); 3096 } 3097 write_unlock(&sp->lock); 3098 } 3099 3100 #ifdef CONFIG_NUMA_BALANCING 3101 static int __initdata numabalancing_override; 3102 3103 static void __init check_numabalancing_enable(void) 3104 { 3105 bool numabalancing_default = false; 3106 3107 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 3108 numabalancing_default = true; 3109 3110 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3111 if (numabalancing_override) 3112 set_numabalancing_state(numabalancing_override == 1); 3113 3114 if (num_online_nodes() > 1 && !numabalancing_override) { 3115 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3116 numabalancing_default ? 
"Enabling" : "Disabling"); 3117 set_numabalancing_state(numabalancing_default); 3118 } 3119 } 3120 3121 static int __init setup_numabalancing(char *str) 3122 { 3123 int ret = 0; 3124 if (!str) 3125 goto out; 3126 3127 if (!strcmp(str, "enable")) { 3128 numabalancing_override = 1; 3129 ret = 1; 3130 } else if (!strcmp(str, "disable")) { 3131 numabalancing_override = -1; 3132 ret = 1; 3133 } 3134 out: 3135 if (!ret) 3136 pr_warn("Unable to parse numa_balancing=\n"); 3137 3138 return ret; 3139 } 3140 __setup("numa_balancing=", setup_numabalancing); 3141 #else 3142 static inline void __init check_numabalancing_enable(void) 3143 { 3144 } 3145 #endif /* CONFIG_NUMA_BALANCING */ 3146 3147 void __init numa_policy_init(void) 3148 { 3149 nodemask_t interleave_nodes; 3150 unsigned long largest = 0; 3151 int nid, prefer = 0; 3152 3153 policy_cache = kmem_cache_create("numa_policy", 3154 sizeof(struct mempolicy), 3155 0, SLAB_PANIC, NULL); 3156 3157 sn_cache = kmem_cache_create("shared_policy_node", 3158 sizeof(struct sp_node), 3159 0, SLAB_PANIC, NULL); 3160 3161 for_each_node(nid) { 3162 preferred_node_policy[nid] = (struct mempolicy) { 3163 .refcnt = ATOMIC_INIT(1), 3164 .mode = MPOL_PREFERRED, 3165 .flags = MPOL_F_MOF | MPOL_F_MORON, 3166 .nodes = nodemask_of_node(nid), 3167 }; 3168 } 3169 3170 /* 3171 * Set interleaving policy for system init. Interleaving is only 3172 * enabled across suitably sized nodes (default is >= 16MB), or 3173 * fall back to the largest node if they're all smaller. 3174 */ 3175 nodes_clear(interleave_nodes); 3176 for_each_node_state(nid, N_MEMORY) { 3177 unsigned long total_pages = node_present_pages(nid); 3178 3179 /* Preserve the largest node */ 3180 if (largest < total_pages) { 3181 largest = total_pages; 3182 prefer = nid; 3183 } 3184 3185 /* Interleave this node? */ 3186 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3187 node_set(nid, interleave_nodes); 3188 } 3189 3190 /* All too small, use the largest */ 3191 if (unlikely(nodes_empty(interleave_nodes))) 3192 node_set(prefer, interleave_nodes); 3193 3194 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3195 pr_err("%s: interleaving failed\n", __func__); 3196 3197 check_numabalancing_enable(); 3198 } 3199 3200 /* Reset policy of current process to default */ 3201 void numa_default_policy(void) 3202 { 3203 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 3204 } 3205 3206 /* 3207 * Parse and format mempolicy from/to strings 3208 */ 3209 static const char * const policy_modes[] = 3210 { 3211 [MPOL_DEFAULT] = "default", 3212 [MPOL_PREFERRED] = "prefer", 3213 [MPOL_BIND] = "bind", 3214 [MPOL_INTERLEAVE] = "interleave", 3215 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3216 [MPOL_LOCAL] = "local", 3217 [MPOL_PREFERRED_MANY] = "prefer (many)", 3218 }; 3219 3220 #ifdef CONFIG_TMPFS 3221 /** 3222 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 3223 * @str: string containing mempolicy to parse 3224 * @mpol: pointer to struct mempolicy pointer, returned on success. 
3225 * 3226 * Format of input: 3227 * <mode>[=<flags>][:<nodelist>] 3228 * 3229 * Return: %0 on success, else %1 3230 */ 3231 int mpol_parse_str(char *str, struct mempolicy **mpol) 3232 { 3233 struct mempolicy *new = NULL; 3234 unsigned short mode_flags; 3235 nodemask_t nodes; 3236 char *nodelist = strchr(str, ':'); 3237 char *flags = strchr(str, '='); 3238 int err = 1, mode; 3239 3240 if (flags) 3241 *flags++ = '\0'; /* terminate mode string */ 3242 3243 if (nodelist) { 3244 /* NUL-terminate mode or flags string */ 3245 *nodelist++ = '\0'; 3246 if (nodelist_parse(nodelist, nodes)) 3247 goto out; 3248 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3249 goto out; 3250 } else 3251 nodes_clear(nodes); 3252 3253 mode = match_string(policy_modes, MPOL_MAX, str); 3254 if (mode < 0) 3255 goto out; 3256 3257 switch (mode) { 3258 case MPOL_PREFERRED: 3259 /* 3260 * Insist on a nodelist of one node only, although later 3261 * we use first_node(nodes) to grab a single node, so here 3262 * nodelist (or nodes) cannot be empty. 3263 */ 3264 if (nodelist) { 3265 char *rest = nodelist; 3266 while (isdigit(*rest)) 3267 rest++; 3268 if (*rest) 3269 goto out; 3270 if (nodes_empty(nodes)) 3271 goto out; 3272 } 3273 break; 3274 case MPOL_INTERLEAVE: 3275 case MPOL_WEIGHTED_INTERLEAVE: 3276 /* 3277 * Default to online nodes with memory if no nodelist 3278 */ 3279 if (!nodelist) 3280 nodes = node_states[N_MEMORY]; 3281 break; 3282 case MPOL_LOCAL: 3283 /* 3284 * Don't allow a nodelist; mpol_new() checks flags 3285 */ 3286 if (nodelist) 3287 goto out; 3288 break; 3289 case MPOL_DEFAULT: 3290 /* 3291 * Insist on a empty nodelist 3292 */ 3293 if (!nodelist) 3294 err = 0; 3295 goto out; 3296 case MPOL_PREFERRED_MANY: 3297 case MPOL_BIND: 3298 /* 3299 * Insist on a nodelist 3300 */ 3301 if (!nodelist) 3302 goto out; 3303 } 3304 3305 mode_flags = 0; 3306 if (flags) { 3307 /* 3308 * Currently, we only support two mutually exclusive 3309 * mode flags. 3310 */ 3311 if (!strcmp(flags, "static")) 3312 mode_flags |= MPOL_F_STATIC_NODES; 3313 else if (!strcmp(flags, "relative")) 3314 mode_flags |= MPOL_F_RELATIVE_NODES; 3315 else 3316 goto out; 3317 } 3318 3319 new = mpol_new(mode, mode_flags, &nodes); 3320 if (IS_ERR(new)) 3321 goto out; 3322 3323 /* 3324 * Save nodes for mpol_to_str() to show the tmpfs mount options 3325 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3326 */ 3327 if (mode != MPOL_PREFERRED) { 3328 new->nodes = nodes; 3329 } else if (nodelist) { 3330 nodes_clear(new->nodes); 3331 node_set(first_node(nodes), new->nodes); 3332 } else { 3333 new->mode = MPOL_LOCAL; 3334 } 3335 3336 /* 3337 * Save nodes for contextualization: this will be used to "clone" 3338 * the mempolicy in a specific context [cpuset] at a later time. 3339 */ 3340 new->w.user_nodemask = nodes; 3341 3342 err = 0; 3343 3344 out: 3345 /* Restore string for error message */ 3346 if (nodelist) 3347 *--nodelist = ':'; 3348 if (flags) 3349 *--flags = '='; 3350 if (!err) 3351 *mpol = new; 3352 return err; 3353 } 3354 #endif /* CONFIG_TMPFS */ 3355 3356 /** 3357 * mpol_to_str - format a mempolicy structure for printing 3358 * @buffer: to contain formatted mempolicy string 3359 * @maxlen: length of @buffer 3360 * @pol: pointer to mempolicy to be formatted 3361 * 3362 * Convert @pol into a string. If @buffer is too short, truncate the string. 3363 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3364 * interleave", plus the longest flag flags, "relative|balancing", and to 3365 * display at least a few node ids. 
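 *
 * Example output (illustrative):
 *	"default"
 *	"bind:0-3"
 *	"interleave=static:0,2"
 *	"weighted interleave:0-3"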
3366 */ 3367 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3368 { 3369 char *p = buffer; 3370 nodemask_t nodes = NODE_MASK_NONE; 3371 unsigned short mode = MPOL_DEFAULT; 3372 unsigned short flags = 0; 3373 3374 if (pol && 3375 pol != &default_policy && 3376 !(pol >= &preferred_node_policy[0] && 3377 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3378 mode = pol->mode; 3379 flags = pol->flags; 3380 } 3381 3382 switch (mode) { 3383 case MPOL_DEFAULT: 3384 case MPOL_LOCAL: 3385 break; 3386 case MPOL_PREFERRED: 3387 case MPOL_PREFERRED_MANY: 3388 case MPOL_BIND: 3389 case MPOL_INTERLEAVE: 3390 case MPOL_WEIGHTED_INTERLEAVE: 3391 nodes = pol->nodes; 3392 break; 3393 default: 3394 WARN_ON_ONCE(1); 3395 snprintf(p, maxlen, "unknown"); 3396 return; 3397 } 3398 3399 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3400 3401 if (flags & MPOL_MODE_FLAGS) { 3402 p += snprintf(p, buffer + maxlen - p, "="); 3403 3404 /* 3405 * Static and relative are mutually exclusive. 3406 */ 3407 if (flags & MPOL_F_STATIC_NODES) 3408 p += snprintf(p, buffer + maxlen - p, "static"); 3409 else if (flags & MPOL_F_RELATIVE_NODES) 3410 p += snprintf(p, buffer + maxlen - p, "relative"); 3411 3412 if (flags & MPOL_F_NUMA_BALANCING) { 3413 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3414 p += snprintf(p, buffer + maxlen - p, "|"); 3415 p += snprintf(p, buffer + maxlen - p, "balancing"); 3416 } 3417 } 3418 3419 if (!nodes_empty(nodes)) 3420 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3421 nodemask_pr_args(&nodes)); 3422 } 3423 3424 #ifdef CONFIG_SYSFS 3425 struct iw_node_attr { 3426 struct kobj_attribute kobj_attr; 3427 int nid; 3428 }; 3429 3430 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3431 char *buf) 3432 { 3433 struct iw_node_attr *node_attr; 3434 u8 weight; 3435 3436 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3437 weight = get_il_weight(node_attr->nid); 3438 return sysfs_emit(buf, "%d\n", weight); 3439 } 3440 3441 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3442 const char *buf, size_t count) 3443 { 3444 struct iw_node_attr *node_attr; 3445 u8 *new; 3446 u8 *old; 3447 u8 weight = 0; 3448 3449 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3450 if (count == 0 || sysfs_streq(buf, "")) 3451 weight = 0; 3452 else if (kstrtou8(buf, 0, &weight)) 3453 return -EINVAL; 3454 3455 new = kzalloc(nr_node_ids, GFP_KERNEL); 3456 if (!new) 3457 return -ENOMEM; 3458 3459 mutex_lock(&iw_table_lock); 3460 old = rcu_dereference_protected(iw_table, 3461 lockdep_is_held(&iw_table_lock)); 3462 if (old) 3463 memcpy(new, old, nr_node_ids); 3464 new[node_attr->nid] = weight; 3465 rcu_assign_pointer(iw_table, new); 3466 mutex_unlock(&iw_table_lock); 3467 synchronize_rcu(); 3468 kfree(old); 3469 return count; 3470 } 3471 3472 static struct iw_node_attr **node_attrs; 3473 3474 static void sysfs_wi_node_release(struct iw_node_attr *node_attr, 3475 struct kobject *parent) 3476 { 3477 if (!node_attr) 3478 return; 3479 sysfs_remove_file(parent, &node_attr->kobj_attr.attr); 3480 kfree(node_attr->kobj_attr.attr.name); 3481 kfree(node_attr); 3482 } 3483 3484 static void sysfs_wi_release(struct kobject *wi_kobj) 3485 { 3486 int i; 3487 3488 for (i = 0; i < nr_node_ids; i++) 3489 sysfs_wi_node_release(node_attrs[i], wi_kobj); 3490 kobject_put(wi_kobj); 3491 } 3492 3493 static const struct kobj_type wi_ktype = { 3494 .sysfs_ops = &kobj_sysfs_ops, 3495 .release = sysfs_wi_release, 3496 }; 3497 3498 
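/*
 * Illustrative sketch (userspace, not kernel code): reading and updating a
 * node's interleave weight through the sysfs files backed by node_show() and
 * node_store() above. The path below is the one created by
 * mempolicy_sysfs_init() and add_weighted_interleave_group() further down.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		const char *path =
 *			"/sys/kernel/mm/mempolicy/weighted_interleave/node0";
 *		unsigned int weight;
 *		FILE *f;
 *
 *		f = fopen(path, "r");
 *		if (f && fscanf(f, "%u", &weight) == 1)
 *			printf("node0 weight: %u\n", weight);
 *		if (f)
 *			fclose(f);
 *
 *		f = fopen(path, "w");		(needs root)
 *		if (f) {
 *			fprintf(f, "4\n");	(give node0 weight 4)
 *			fclose(f);
 *		}
 *		return 0;
 *	}
 */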
static int add_weight_node(int nid, struct kobject *wi_kobj) 3499 { 3500 struct iw_node_attr *node_attr; 3501 char *name; 3502 3503 node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); 3504 if (!node_attr) 3505 return -ENOMEM; 3506 3507 name = kasprintf(GFP_KERNEL, "node%d", nid); 3508 if (!name) { 3509 kfree(node_attr); 3510 return -ENOMEM; 3511 } 3512 3513 sysfs_attr_init(&node_attr->kobj_attr.attr); 3514 node_attr->kobj_attr.attr.name = name; 3515 node_attr->kobj_attr.attr.mode = 0644; 3516 node_attr->kobj_attr.show = node_show; 3517 node_attr->kobj_attr.store = node_store; 3518 node_attr->nid = nid; 3519 3520 if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { 3521 kfree(node_attr->kobj_attr.attr.name); 3522 kfree(node_attr); 3523 pr_err("failed to add attribute to weighted_interleave\n"); 3524 return -ENOMEM; 3525 } 3526 3527 node_attrs[nid] = node_attr; 3528 return 0; 3529 } 3530 3531 static int add_weighted_interleave_group(struct kobject *root_kobj) 3532 { 3533 struct kobject *wi_kobj; 3534 int nid, err; 3535 3536 wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); 3537 if (!wi_kobj) 3538 return -ENOMEM; 3539 3540 err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, 3541 "weighted_interleave"); 3542 if (err) { 3543 kfree(wi_kobj); 3544 return err; 3545 } 3546 3547 for_each_node_state(nid, N_POSSIBLE) { 3548 err = add_weight_node(nid, wi_kobj); 3549 if (err) { 3550 pr_err("failed to add sysfs [node%d]\n", nid); 3551 break; 3552 } 3553 } 3554 if (err) 3555 kobject_put(wi_kobj); 3556 return 0; 3557 } 3558 3559 static void mempolicy_kobj_release(struct kobject *kobj) 3560 { 3561 u8 *old; 3562 3563 mutex_lock(&iw_table_lock); 3564 old = rcu_dereference_protected(iw_table, 3565 lockdep_is_held(&iw_table_lock)); 3566 rcu_assign_pointer(iw_table, NULL); 3567 mutex_unlock(&iw_table_lock); 3568 synchronize_rcu(); 3569 kfree(old); 3570 kfree(node_attrs); 3571 kfree(kobj); 3572 } 3573 3574 static const struct kobj_type mempolicy_ktype = { 3575 .release = mempolicy_kobj_release 3576 }; 3577 3578 static int __init mempolicy_sysfs_init(void) 3579 { 3580 int err; 3581 static struct kobject *mempolicy_kobj; 3582 3583 mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); 3584 if (!mempolicy_kobj) { 3585 err = -ENOMEM; 3586 goto err_out; 3587 } 3588 3589 node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), 3590 GFP_KERNEL); 3591 if (!node_attrs) { 3592 err = -ENOMEM; 3593 goto mempol_out; 3594 } 3595 3596 err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, 3597 "mempolicy"); 3598 if (err) 3599 goto node_out; 3600 3601 err = add_weighted_interleave_group(mempolicy_kobj); 3602 if (err) { 3603 pr_err("mempolicy sysfs structure failed to initialize\n"); 3604 kobject_put(mempolicy_kobj); 3605 return err; 3606 } 3607 3608 return err; 3609 node_out: 3610 kfree(node_attrs); 3611 mempol_out: 3612 kfree(mempolicy_kobj); 3613 err_out: 3614 pr_err("failed to add mempolicy kobject to the system\n"); 3615 return err; 3616 } 3617 3618 late_initcall(mempolicy_sysfs_init); 3619 #endif /* CONFIG_SYSFS */ 3620
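/*
 * Illustrative sketch (userspace, not kernel code): querying which node backs
 * a page via the get_mempolicy() syscall handled by kernel_get_mempolicy()
 * earlier in this file. Uses the libnuma syscall wrappers from <numaif.h>;
 * build with -lnuma.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *
 *	int main(void)
 *	{
 *		int node = -1;
 *		char *buf = malloc(4096);
 *
 *		if (!buf)
 *			return 1;
 *		memset(buf, 1, 4096);		(touch it so a page is allocated)
 *		if (get_mempolicy(&node, NULL, 0, buf,
 *				  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *			printf("buffer page resides on node %d\n", node);
 *		free(buf);
 *		return 0;
 *	}
 */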