// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
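/*
 * Illustrative only (not part of the original file): the policies above are
 * selected from userspace through the set_mempolicy()/mbind() syscalls,
 * typically via libnuma's <numaif.h> wrappers. A minimal sketch, assuming
 * those wrappers and a machine with nodes 0 and 1:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	// Process policy: interleave future allocations over nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 * The per-VMA equivalent is mbind(); see do_mbind() further down.
 */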

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
 * that the system-default value should be used. A NULL iw_table also denotes
 * that system-default values should be used. Until the system-default table
 * is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected
 */
static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
	u8 *table;
	u8 weight;

	rcu_read_lock();
	table = rcu_dereference(iw_table);
	/* if no iw_table, use system default */
	weight = table ? table[node] : 1;
	/* if value in iw_table is 0, use system default */
	weight = weight ? weight : 1;
	rcu_read_unlock();
	return weight;
}
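/*
 * Illustrative only (not in the original file): the per-node weights read by
 * get_il_weight() are expected to be set from userspace via sysfs; the path
 * shown here is an assumption based on the weighted-interleave ABI:
 *
 *	# echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	# echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 * With these weights, an MPOL_WEIGHTED_INTERLEAVE task allocates roughly
 * four pages on node 0 for every page on node 1.
 */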
/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in @state.
 *
 * Return: this @node if it is in @state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (state >= NR_NODE_STATES)
		return -EINVAL;

	if (node == NUMA_NO_NODE || node_state(node, state))
		return node;

	min_node = node;
	for_each_node_state(n, state) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/**
 * nearest_node_nodemask - Find the node in @mask at the nearest distance
 *			   from @node.
 *
 * @node: a valid node ID to start the search from.
 * @mask: a pointer to a nodemask representing the allowed nodes.
 *
 * This function iterates over all nodes in @mask and calculates the
 * distance from the starting @node, then it returns the node ID that is
 * the closest to @node, or MAX_NUMNODES if no node is found.
 *
 * Note that @node must be a valid node ID usable with node_distance();
 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
 * or unexpected behavior.
 */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;

	for_each_node_mask(n, *mask) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}
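/*
 * Illustrative note (not in the original file), a sketch of what
 * mpol_relative_nodemask() computes: with MPOL_F_RELATIVE_NODES the user
 * nodemask is interpreted relative to the set of allowed nodes. For example,
 * assuming a user nodemask of {0,2} and an allowed set of {4,5,6}:
 *
 *	nodes_fold(tmp, {0,2}, 3);	// fold onto 3 allowed nodes -> {0,2}
 *	nodes_onto(ret, tmp, {4,5,6});	// map bit i to the i-th allowed node
 *					// -> ret = {4,6}
 *
 * i.e. "the 1st and 3rd of whatever nodes I am currently allowed to use".
 */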

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) and local memory policies are not subject to
	 * any remapping. They also do not need any special constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *pol)
{
	if (!atomic_dec_and_test(&pol->refcnt))
		return;
	kmem_cache_free(policy_cache, pol);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
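/*
 * Illustrative note (not in the original file), sketching how the rebind
 * callbacks above react when a cpuset moves a task, assuming a policy created
 * over nodes {0,1} and a cpuset change from mems_allowed {0,1} to {2,3}:
 *
 *	- default (no mode flag):  nodes_remap() gives pol->nodes = {2,3}
 *	- MPOL_F_STATIC_NODES:     user_nodemask & new gives {}, so the code
 *				   falls back to the new mems_allowed {2,3}
 *	- MPOL_F_RELATIVE_NODES:   user_nodemask folded onto the new set,
 *				   again {2,3} here (two nodes onto two nodes)
 */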

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};

static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);

static bool strictly_unmovable(unsigned long flags)
{
	/*
	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
	 * if any misplaced page is found.
	 */
	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
			 MPOL_MF_STRICT;
}

struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;
	pgoff_t ilx;
};

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};
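/*
 * Illustrative note (not in the original file): do_mbind() walks the range
 * with MPOL_MF_INVERT set, so the walk queues folios that are *not* on the
 * requested nodes. A sketch, assuming a new policy over nodes {2,3}:
 *
 *	qp->nmask = {2,3}, qp->flags has MPOL_MF_INVERT
 *	folio on node 0 -> queue_folio_required() == true  (queue/migrate)
 *	folio on node 2 -> queue_folio_required() == false (already placed)
 */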

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
					struct queue_pages *qp)
{
	int nid = folio_nid(folio);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			if (is_migration_entry(pte_to_swp_entry(ptent)))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed++;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}

static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(entry)) {
		if (unlikely(is_hugetlb_entry_migration(entry)))
			qp->nr_failed++;
		goto unlock;
	}
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * We need to check MPOL_MF_STRICT here, so that -EIO can be returned
	 * if necessary, regardless of vma_migratable().
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err ? : qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
			      struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new;	/* protected by mmap_lock */
	mpol_put(old);

	return 0;
err_out:
	mpol_put(new);
	return err;
}

/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (pol == &default_policy)
		return;

	switch (pol->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		*nodes = pol->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}
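/*
 * Illustrative only (not in the original file): lookup_node() backs the
 * common userspace idiom for asking which node currently holds the page
 * backing a given address, roughly (a sketch, assuming libnuma's <numaif.h>
 * wrapper and "addr" being some mapped address):
 *
 *	int node = -1;
 *
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *	// on success, "node" holds the node id of the page at "addr"
 */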

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

#ifdef CONFIG_MIGRATION
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else {
			/*
			 * A non-movable folio may reach here.  There may also
			 * be folios that are temporarily off the LRU, or
			 * non-LRU movable folios.  Treat them as unmovable
			 * folios since they can't be isolated, so they can't
			 * be moved at the moment.
			 */
			return false;
		}
	}
	return true;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	if (err >= 0)
		err += nr_failed;
	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.
	 * If we get all the way through the scan of tmp without finding any
	 * node that moved, much less moved to an empty node, then there is
	 * nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else

static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
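/*
 * Illustrative only (not in the original file): do_mbind() implements the
 * mbind(2) side of the interface described in the header comment. A minimal
 * userspace sketch, assuming libnuma's <numaif.h> wrapper and node 1 being
 * present:
 *
 *	unsigned long mask = 1UL << 1;
 *
 *	// Bind this mapping to node 1 and migrate its existing pages there.
 *	mbind(addr, length, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_MOVE);
 */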

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
		else
			return -EINVAL;
	}
	return 0;
}
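/*
 * Illustrative note (not in the original file): userspace passes the mode
 * flags OR-ed into the mode argument, and sanitize_mpol_flags() splits them
 * back apart. A sketch:
 *
 *	int mode = MPOL_INTERLEAVE | MPOL_F_STATIC_NODES;	// as passed in
 *	unsigned short flags;
 *
 *	sanitize_mpol_flags(&mode, &flags);
 *	// now mode == MPOL_INTERLEAVE, flags == MPOL_F_STATIC_NODES
 */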

static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range has a policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return an error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone. If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	return (vma->vm_ops && vma->vm_ops->get_policy) ?
		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
				 unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol;

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE ||
	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
	return pol;
}

bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;
		pgoff_t ilx;		/* ignored here */

		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * If policy->nodes has movable memory only,
	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
	 *
	 * policy->nodes is intersected with node_states[N_MEMORY], so if
	 * the following test fails, it implies that policy->nodes has
	 * movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}

/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
1954 */ 1955 unsigned int mempolicy_slab_node(void) 1956 { 1957 struct mempolicy *policy; 1958 int node = numa_mem_id(); 1959 1960 if (!in_task()) 1961 return node; 1962 1963 policy = current->mempolicy; 1964 if (!policy) 1965 return node; 1966 1967 switch (policy->mode) { 1968 case MPOL_PREFERRED: 1969 return first_node(policy->nodes); 1970 1971 case MPOL_INTERLEAVE: 1972 return interleave_nodes(policy); 1973 1974 case MPOL_WEIGHTED_INTERLEAVE: 1975 return weighted_interleave_nodes(policy); 1976 1977 case MPOL_BIND: 1978 case MPOL_PREFERRED_MANY: 1979 { 1980 struct zoneref *z; 1981 1982 /* 1983 * Follow bind policy behavior and start allocation at the 1984 * first node. 1985 */ 1986 struct zonelist *zonelist; 1987 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1988 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 1989 z = first_zones_zonelist(zonelist, highest_zoneidx, 1990 &policy->nodes); 1991 return zonelist_zone(z) ? zonelist_node_idx(z) : node; 1992 } 1993 case MPOL_LOCAL: 1994 return node; 1995 1996 default: 1997 BUG(); 1998 } 1999 } 2000 2001 static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 2002 nodemask_t *mask) 2003 { 2004 /* 2005 * barrier stabilizes the nodemask locally so that it can be iterated 2006 * over safely without concern for changes. Allocators validate node 2007 * selection does not violate mems_allowed, so this is safe. 2008 */ 2009 barrier(); 2010 memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 2011 barrier(); 2012 return nodes_weight(*mask); 2013 } 2014 2015 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2016 { 2017 nodemask_t nodemask; 2018 unsigned int target, nr_nodes; 2019 u8 *table; 2020 unsigned int weight_total = 0; 2021 u8 weight; 2022 int nid; 2023 2024 nr_nodes = read_once_policy_nodemask(pol, &nodemask); 2025 if (!nr_nodes) 2026 return numa_node_id(); 2027 2028 rcu_read_lock(); 2029 table = rcu_dereference(iw_table); 2030 /* calculate the total weight */ 2031 for_each_node_mask(nid, nodemask) { 2032 /* detect system default usage */ 2033 weight = table ? table[nid] : 1; 2034 weight = weight ? weight : 1; 2035 weight_total += weight; 2036 } 2037 2038 /* Calculate the node offset based on totals */ 2039 target = ilx % weight_total; 2040 nid = first_node(nodemask); 2041 while (target) { 2042 /* detect system default usage */ 2043 weight = table ? table[nid] : 1; 2044 weight = weight ? weight : 1; 2045 if (target < weight) 2046 break; 2047 target -= weight; 2048 nid = next_node_in(nid, nodemask); 2049 } 2050 rcu_read_unlock(); 2051 return nid; 2052 } 2053 2054 /* 2055 * Do static interleaving for interleave index @ilx. Returns the ilx'th 2056 * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2057 * exceeds the number of present nodes. 2058 */ 2059 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2060 { 2061 nodemask_t nodemask; 2062 unsigned int target, nnodes; 2063 int i; 2064 int nid; 2065 2066 nnodes = read_once_policy_nodemask(pol, &nodemask); 2067 if (!nnodes) 2068 return numa_node_id(); 2069 target = ilx % nnodes; 2070 nid = first_node(nodemask); 2071 for (i = 0; i < target; i++) 2072 nid = next_node(nid, nodemask); 2073 return nid; 2074 } 2075 2076 /* 2077 * Return a nodemask representing a mempolicy for filtering nodes for 2078 * page allocation, together with preferred node id (or the input node id). 
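 *
 * For example (illustrative): MPOL_PREFERRED returns NULL and rewrites *nid
 * to the first node of the policy; MPOL_BIND over nodes 2-3 returns
 * &pol->nodes (subject to the zone check) and overrides *nid only when a
 * home node was set; the two interleave modes return NULL and derive *nid
 * from the interleave index.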
2079 */ 2080 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2081 pgoff_t ilx, int *nid) 2082 { 2083 nodemask_t *nodemask = NULL; 2084 2085 switch (pol->mode) { 2086 case MPOL_PREFERRED: 2087 /* Override input node id */ 2088 *nid = first_node(pol->nodes); 2089 break; 2090 case MPOL_PREFERRED_MANY: 2091 nodemask = &pol->nodes; 2092 if (pol->home_node != NUMA_NO_NODE) 2093 *nid = pol->home_node; 2094 break; 2095 case MPOL_BIND: 2096 /* Restrict to nodemask (but not on lower zones) */ 2097 if (apply_policy_zone(pol, gfp_zone(gfp)) && 2098 cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2099 nodemask = &pol->nodes; 2100 if (pol->home_node != NUMA_NO_NODE) 2101 *nid = pol->home_node; 2102 /* 2103 * __GFP_THISNODE shouldn't even be used with the bind policy 2104 * because we might easily break the expectation to stay on the 2105 * requested node and not break the policy. 2106 */ 2107 WARN_ON_ONCE(gfp & __GFP_THISNODE); 2108 break; 2109 case MPOL_INTERLEAVE: 2110 /* Override input node id */ 2111 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2112 interleave_nodes(pol) : interleave_nid(pol, ilx); 2113 break; 2114 case MPOL_WEIGHTED_INTERLEAVE: 2115 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2116 weighted_interleave_nodes(pol) : 2117 weighted_interleave_nid(pol, ilx); 2118 break; 2119 } 2120 2121 return nodemask; 2122 } 2123 2124 #ifdef CONFIG_HUGETLBFS 2125 /* 2126 * huge_node(@vma, @addr, @gfp_flags, @mpol) 2127 * @vma: virtual memory area whose policy is sought 2128 * @addr: address in @vma for shared policy lookup and interleave policy 2129 * @gfp_flags: for requested zone 2130 * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2131 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2132 * 2133 * Returns a nid suitable for a huge page allocation and a pointer 2134 * to the struct mempolicy for conditional unref after allocation. 2135 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2136 * to the mempolicy's @nodemask for filtering the zonelist. 2137 */ 2138 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 2139 struct mempolicy **mpol, nodemask_t **nodemask) 2140 { 2141 pgoff_t ilx; 2142 int nid; 2143 2144 nid = numa_node_id(); 2145 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2146 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 2147 return nid; 2148 } 2149 2150 /* 2151 * init_nodemask_of_mempolicy 2152 * 2153 * If the current task's mempolicy is "default" [NULL], return 'false' 2154 * to indicate default policy. Otherwise, extract the policy nodemask 2155 * for 'bind' or 'interleave' policy into the argument nodemask, or 2156 * initialize the argument nodemask to contain the single node for 2157 * 'preferred' or 'local' policy and return 'true' to indicate presence 2158 * of non-default mempolicy. 2159 * 2160 * We don't bother with reference counting the mempolicy [mpol_get/put] 2161 * because the current task is examining it's own mempolicy and a task's 2162 * mempolicy is only ever changed by the task itself. 2163 * 2164 * N.B., it is the caller's responsibility to free a returned nodemask. 
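 *
 * Minimal usage sketch (hypothetical caller; NODEMASK_ALLOC/NODEMASK_FREE
 * come from <linux/nodemask.h>):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *
 *	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed)) {
 *		... restrict the operation to the nodes in *nodes_allowed ...
 *	}
 *	NODEMASK_FREE(nodes_allowed);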
2165 */ 2166 bool init_nodemask_of_mempolicy(nodemask_t *mask) 2167 { 2168 struct mempolicy *mempolicy; 2169 2170 if (!(mask && current->mempolicy)) 2171 return false; 2172 2173 task_lock(current); 2174 mempolicy = current->mempolicy; 2175 switch (mempolicy->mode) { 2176 case MPOL_PREFERRED: 2177 case MPOL_PREFERRED_MANY: 2178 case MPOL_BIND: 2179 case MPOL_INTERLEAVE: 2180 case MPOL_WEIGHTED_INTERLEAVE: 2181 *mask = mempolicy->nodes; 2182 break; 2183 2184 case MPOL_LOCAL: 2185 init_nodemask_of_node(mask, numa_node_id()); 2186 break; 2187 2188 default: 2189 BUG(); 2190 } 2191 task_unlock(current); 2192 2193 return true; 2194 } 2195 #endif 2196 2197 /* 2198 * mempolicy_in_oom_domain 2199 * 2200 * If tsk's mempolicy is "bind", check for intersection between mask and 2201 * the policy nodemask. Otherwise, return true for all other policies 2202 * including "interleave", as a tsk with "interleave" policy may have 2203 * memory allocated from all nodes in system. 2204 * 2205 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 2206 */ 2207 bool mempolicy_in_oom_domain(struct task_struct *tsk, 2208 const nodemask_t *mask) 2209 { 2210 struct mempolicy *mempolicy; 2211 bool ret = true; 2212 2213 if (!mask) 2214 return ret; 2215 2216 task_lock(tsk); 2217 mempolicy = tsk->mempolicy; 2218 if (mempolicy && mempolicy->mode == MPOL_BIND) 2219 ret = nodes_intersects(mempolicy->nodes, *mask); 2220 task_unlock(tsk); 2221 2222 return ret; 2223 } 2224 2225 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2226 int nid, nodemask_t *nodemask) 2227 { 2228 struct page *page; 2229 gfp_t preferred_gfp; 2230 2231 /* 2232 * This is a two pass approach. The first pass will only try the 2233 * preferred nodes but skip the direct reclaim and allow the 2234 * allocation to fail, while the second pass will try all the 2235 * nodes in system. 2236 */ 2237 preferred_gfp = gfp | __GFP_NOWARN; 2238 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2239 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); 2240 if (!page) 2241 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); 2242 2243 return page; 2244 } 2245 2246 /** 2247 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2248 * @gfp: GFP flags. 2249 * @order: Order of the page allocation. 2250 * @pol: Pointer to the NUMA mempolicy. 2251 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2252 * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2253 * 2254 * Return: The page on success or NULL if allocation fails. 2255 */ 2256 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2257 struct mempolicy *pol, pgoff_t ilx, int nid) 2258 { 2259 nodemask_t *nodemask; 2260 struct page *page; 2261 2262 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2263 2264 if (pol->mode == MPOL_PREFERRED_MANY) 2265 return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2266 2267 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2268 /* filter "hugepage" allocation, unless from alloc_pages() */ 2269 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2270 /* 2271 * For hugepage allocation and non-interleave policy which 2272 * allows the current node (or other explicitly preferred 2273 * node) we only try to allocate from the current/preferred 2274 * node and don't fall back to other nodes, as the cost of 2275 * remote accesses would likely offset THP benefits. 
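 *
 * For example (illustrative), a PMD-sized fault under MPOL_PREFERRED on
 * node 1 is first attempted on node 1 alone with __GFP_THISNODE and
 * __GFP_NORETRY, and falls through to the normal allocation below only
 * when the gfp mask permits direct reclaim.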
2276 * 2277 * If the policy is interleave or does not allow the current 2278 * node in its nodemask, we allocate the standard way. 2279 */ 2280 if (pol->mode != MPOL_INTERLEAVE && 2281 pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2282 (!nodemask || node_isset(nid, *nodemask))) { 2283 /* 2284 * First, try to allocate THP only on local node, but 2285 * don't reclaim unnecessarily, just compact. 2286 */ 2287 page = __alloc_frozen_pages_noprof( 2288 gfp | __GFP_THISNODE | __GFP_NORETRY, order, 2289 nid, NULL); 2290 if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2291 return page; 2292 /* 2293 * If hugepage allocations are configured to always 2294 * synchronous compact or the vma has been madvised 2295 * to prefer hugepage backing, retry allowing remote 2296 * memory with both reclaim and compact as well. 2297 */ 2298 } 2299 } 2300 2301 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); 2302 2303 if (unlikely(pol->mode == MPOL_INTERLEAVE || 2304 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { 2305 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2306 if (static_branch_likely(&vm_numa_stat_key) && 2307 page_to_nid(page) == nid) { 2308 preempt_disable(); 2309 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2310 preempt_enable(); 2311 } 2312 } 2313 2314 return page; 2315 } 2316 2317 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, 2318 struct mempolicy *pol, pgoff_t ilx, int nid) 2319 { 2320 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, 2321 ilx, nid); 2322 if (!page) 2323 return NULL; 2324 2325 set_page_refcounted(page); 2326 return page_rmappable_folio(page); 2327 } 2328 2329 /** 2330 * vma_alloc_folio - Allocate a folio for a VMA. 2331 * @gfp: GFP flags. 2332 * @order: Order of the folio. 2333 * @vma: Pointer to VMA. 2334 * @addr: Virtual address of the allocation. Must be inside @vma. 2335 * 2336 * Allocate a folio for a specific address in @vma, using the appropriate 2337 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2338 * VMA to prevent it from going away. Should be used for all allocations 2339 * for folios that will be mapped into user space, excepting hugetlbfs, and 2340 * excepting where direct use of folio_alloc_mpol() is more appropriate. 2341 * 2342 * Return: The folio on success or NULL if allocation fails. 2343 */ 2344 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, 2345 unsigned long addr) 2346 { 2347 struct mempolicy *pol; 2348 pgoff_t ilx; 2349 struct folio *folio; 2350 2351 if (vma->vm_flags & VM_DROPPABLE) 2352 gfp |= __GFP_NOWARN; 2353 2354 pol = get_vma_policy(vma, addr, order, &ilx); 2355 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2356 mpol_cond_put(pol); 2357 return folio; 2358 } 2359 EXPORT_SYMBOL(vma_alloc_folio_noprof); 2360 2361 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) 2362 { 2363 struct mempolicy *pol = &default_policy; 2364 2365 /* 2366 * No reference counting needed for current->mempolicy 2367 * nor system default_policy 2368 */ 2369 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2370 pol = get_task_policy(current); 2371 2372 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, 2373 numa_node_id()); 2374 } 2375 2376 /** 2377 * alloc_pages - Allocate pages. 2378 * @gfp: GFP flags. 2379 * @order: Power of two of number of pages to allocate. 2380 * 2381 * Allocate 1 << @order contiguous pages. 
The physical address of the 2382 * first page is naturally aligned (eg an order-3 allocation will be aligned 2383 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 2384 * process is honoured when in process context. 2385 * 2386 * Context: Can be called from any context, providing the appropriate GFP 2387 * flags are used. 2388 * Return: The page on success or NULL if allocation fails. 2389 */ 2390 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) 2391 { 2392 struct page *page = alloc_frozen_pages_noprof(gfp, order); 2393 2394 if (page) 2395 set_page_refcounted(page); 2396 return page; 2397 } 2398 EXPORT_SYMBOL(alloc_pages_noprof); 2399 2400 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) 2401 { 2402 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); 2403 } 2404 EXPORT_SYMBOL(folio_alloc_noprof); 2405 2406 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, 2407 struct mempolicy *pol, unsigned long nr_pages, 2408 struct page **page_array) 2409 { 2410 int nodes; 2411 unsigned long nr_pages_per_node; 2412 int delta; 2413 int i; 2414 unsigned long nr_allocated; 2415 unsigned long total_allocated = 0; 2416 2417 nodes = nodes_weight(pol->nodes); 2418 nr_pages_per_node = nr_pages / nodes; 2419 delta = nr_pages - nodes * nr_pages_per_node; 2420 2421 for (i = 0; i < nodes; i++) { 2422 if (delta) { 2423 nr_allocated = alloc_pages_bulk_noprof(gfp, 2424 interleave_nodes(pol), NULL, 2425 nr_pages_per_node + 1, 2426 page_array); 2427 delta--; 2428 } else { 2429 nr_allocated = alloc_pages_bulk_noprof(gfp, 2430 interleave_nodes(pol), NULL, 2431 nr_pages_per_node, page_array); 2432 } 2433 2434 page_array += nr_allocated; 2435 total_allocated += nr_allocated; 2436 } 2437 2438 return total_allocated; 2439 } 2440 2441 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, 2442 struct mempolicy *pol, unsigned long nr_pages, 2443 struct page **page_array) 2444 { 2445 struct task_struct *me = current; 2446 unsigned int cpuset_mems_cookie; 2447 unsigned long total_allocated = 0; 2448 unsigned long nr_allocated = 0; 2449 unsigned long rounds; 2450 unsigned long node_pages, delta; 2451 u8 *table, *weights, weight; 2452 unsigned int weight_total = 0; 2453 unsigned long rem_pages = nr_pages; 2454 nodemask_t nodes; 2455 int nnodes, node; 2456 int resume_node = MAX_NUMNODES - 1; 2457 u8 resume_weight = 0; 2458 int prev_node; 2459 int i; 2460 2461 if (!nr_pages) 2462 return 0; 2463 2464 /* read the nodes onto the stack, retry if done during rebind */ 2465 do { 2466 cpuset_mems_cookie = read_mems_allowed_begin(); 2467 nnodes = read_once_policy_nodemask(pol, &nodes); 2468 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2469 2470 /* if the nodemask has become invalid, we cannot do anything */ 2471 if (!nnodes) 2472 return 0; 2473 2474 /* Continue allocating from most recent node and adjust the nr_pages */ 2475 node = me->il_prev; 2476 weight = me->il_weight; 2477 if (weight && node_isset(node, nodes)) { 2478 node_pages = min(rem_pages, weight); 2479 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2480 page_array); 2481 page_array += nr_allocated; 2482 total_allocated += nr_allocated; 2483 /* if that's all the pages, no need to interleave */ 2484 if (rem_pages <= weight) { 2485 me->il_weight -= rem_pages; 2486 return total_allocated; 2487 } 2488 /* Otherwise we adjust remaining pages, continue from there */ 2489 rem_pages -= weight; 2490 } 2491 /* clear active weight in case of an allocation failure */ 2492 
me->il_weight = 0; 2493 prev_node = node; 2494 2495 /* create a local copy of node weights to operate on outside rcu */ 2496 weights = kzalloc(nr_node_ids, GFP_KERNEL); 2497 if (!weights) 2498 return total_allocated; 2499 2500 rcu_read_lock(); 2501 table = rcu_dereference(iw_table); 2502 if (table) 2503 memcpy(weights, table, nr_node_ids); 2504 rcu_read_unlock(); 2505 2506 /* calculate total, detect system default usage */ 2507 for_each_node_mask(node, nodes) { 2508 if (!weights[node]) 2509 weights[node] = 1; 2510 weight_total += weights[node]; 2511 } 2512 2513 /* 2514 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2515 * Track which node weighted interleave should resume from. 2516 * 2517 * if (rounds > 0) and (delta == 0), resume_node will always be 2518 * the node following prev_node and its weight. 2519 */ 2520 rounds = rem_pages / weight_total; 2521 delta = rem_pages % weight_total; 2522 resume_node = next_node_in(prev_node, nodes); 2523 resume_weight = weights[resume_node]; 2524 for (i = 0; i < nnodes; i++) { 2525 node = next_node_in(prev_node, nodes); 2526 weight = weights[node]; 2527 node_pages = weight * rounds; 2528 /* If a delta exists, add this node's portion of the delta */ 2529 if (delta > weight) { 2530 node_pages += weight; 2531 delta -= weight; 2532 } else if (delta) { 2533 /* when delta is depleted, resume from that node */ 2534 node_pages += delta; 2535 resume_node = node; 2536 resume_weight = weight - delta; 2537 delta = 0; 2538 } 2539 /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2540 if (!node_pages) 2541 break; 2542 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2543 page_array); 2544 page_array += nr_allocated; 2545 total_allocated += nr_allocated; 2546 if (total_allocated == nr_pages) 2547 break; 2548 prev_node = node; 2549 } 2550 me->il_prev = resume_node; 2551 me->il_weight = resume_weight; 2552 kfree(weights); 2553 return total_allocated; 2554 } 2555 2556 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, 2557 struct mempolicy *pol, unsigned long nr_pages, 2558 struct page **page_array) 2559 { 2560 gfp_t preferred_gfp; 2561 unsigned long nr_allocated = 0; 2562 2563 preferred_gfp = gfp | __GFP_NOWARN; 2564 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2565 2566 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, 2567 nr_pages, page_array); 2568 2569 if (nr_allocated < nr_pages) 2570 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, 2571 nr_pages - nr_allocated, 2572 page_array + nr_allocated); 2573 return nr_allocated; 2574 } 2575 2576 /* alloc pages bulk and mempolicy should be considered at the 2577 * same time in some situation such as vmalloc. 2578 * 2579 * It can accelerate memory allocation especially interleaving 2580 * allocate memory. 
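 *
 * Rough usage sketch (hypothetical caller along the lines of the vmalloc
 * bulk path; error handling omitted):
 *
 *	struct page **pages = kvcalloc(nr, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	got = alloc_pages_bulk_mempolicy_noprof(GFP_KERNEL, nr, pages);
 *
 * got may be smaller than nr, so callers must cope with partial success
 * (retry, fall back to single-page allocations, or bail out).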
2581 */ 2582 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, 2583 unsigned long nr_pages, struct page **page_array) 2584 { 2585 struct mempolicy *pol = &default_policy; 2586 nodemask_t *nodemask; 2587 int nid; 2588 2589 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2590 pol = get_task_policy(current); 2591 2592 if (pol->mode == MPOL_INTERLEAVE) 2593 return alloc_pages_bulk_interleave(gfp, pol, 2594 nr_pages, page_array); 2595 2596 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2597 return alloc_pages_bulk_weighted_interleave( 2598 gfp, pol, nr_pages, page_array); 2599 2600 if (pol->mode == MPOL_PREFERRED_MANY) 2601 return alloc_pages_bulk_preferred_many(gfp, 2602 numa_node_id(), pol, nr_pages, page_array); 2603 2604 nid = numa_node_id(); 2605 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2606 return alloc_pages_bulk_noprof(gfp, nid, nodemask, 2607 nr_pages, page_array); 2608 } 2609 2610 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2611 { 2612 struct mempolicy *pol = mpol_dup(src->vm_policy); 2613 2614 if (IS_ERR(pol)) 2615 return PTR_ERR(pol); 2616 dst->vm_policy = pol; 2617 return 0; 2618 } 2619 2620 /* 2621 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2622 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2623 * with the mems_allowed returned by cpuset_mems_allowed(). This 2624 * keeps mempolicies cpuset relative after its cpuset moves. See 2625 * further kernel/cpuset.c update_nodemask(). 2626 * 2627 * current's mempolicy may be rebinded by the other task(the task that changes 2628 * cpuset's mems), so we needn't do rebind work for current task. 2629 */ 2630 2631 /* Slow path of a mempolicy duplicate */ 2632 struct mempolicy *__mpol_dup(struct mempolicy *old) 2633 { 2634 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2635 2636 if (!new) 2637 return ERR_PTR(-ENOMEM); 2638 2639 /* task's mempolicy is protected by alloc_lock */ 2640 if (old == current->mempolicy) { 2641 task_lock(current); 2642 *new = *old; 2643 task_unlock(current); 2644 } else 2645 *new = *old; 2646 2647 if (current_cpuset_is_being_rebound()) { 2648 nodemask_t mems = cpuset_mems_allowed(current); 2649 mpol_rebind_policy(new, &mems); 2650 } 2651 atomic_set(&new->refcnt, 1); 2652 return new; 2653 } 2654 2655 /* Slow path of a mempolicy comparison */ 2656 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2657 { 2658 if (!a || !b) 2659 return false; 2660 if (a->mode != b->mode) 2661 return false; 2662 if (a->flags != b->flags) 2663 return false; 2664 if (a->home_node != b->home_node) 2665 return false; 2666 if (mpol_store_user_nodemask(a)) 2667 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2668 return false; 2669 2670 switch (a->mode) { 2671 case MPOL_BIND: 2672 case MPOL_INTERLEAVE: 2673 case MPOL_PREFERRED: 2674 case MPOL_PREFERRED_MANY: 2675 case MPOL_WEIGHTED_INTERLEAVE: 2676 return !!nodes_equal(a->nodes, b->nodes); 2677 case MPOL_LOCAL: 2678 return true; 2679 default: 2680 BUG(); 2681 return false; 2682 } 2683 } 2684 2685 /* 2686 * Shared memory backing store policy support. 2687 * 2688 * Remember policies even when nobody has shared memory mapped. 2689 * The policies are kept in Red-Black tree linked from the inode. 2690 * They are protected by the sp->lock rwlock, which should be held 2691 * for any accesses to the tree. 2692 */ 2693 2694 /* 2695 * lookup first element intersecting start-end. 
Caller holds sp->lock for 2696 * reading or for writing 2697 */ 2698 static struct sp_node *sp_lookup(struct shared_policy *sp, 2699 pgoff_t start, pgoff_t end) 2700 { 2701 struct rb_node *n = sp->root.rb_node; 2702 2703 while (n) { 2704 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2705 2706 if (start >= p->end) 2707 n = n->rb_right; 2708 else if (end <= p->start) 2709 n = n->rb_left; 2710 else 2711 break; 2712 } 2713 if (!n) 2714 return NULL; 2715 for (;;) { 2716 struct sp_node *w = NULL; 2717 struct rb_node *prev = rb_prev(n); 2718 if (!prev) 2719 break; 2720 w = rb_entry(prev, struct sp_node, nd); 2721 if (w->end <= start) 2722 break; 2723 n = prev; 2724 } 2725 return rb_entry(n, struct sp_node, nd); 2726 } 2727 2728 /* 2729 * Insert a new shared policy into the list. Caller holds sp->lock for 2730 * writing. 2731 */ 2732 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2733 { 2734 struct rb_node **p = &sp->root.rb_node; 2735 struct rb_node *parent = NULL; 2736 struct sp_node *nd; 2737 2738 while (*p) { 2739 parent = *p; 2740 nd = rb_entry(parent, struct sp_node, nd); 2741 if (new->start < nd->start) 2742 p = &(*p)->rb_left; 2743 else if (new->end > nd->end) 2744 p = &(*p)->rb_right; 2745 else 2746 BUG(); 2747 } 2748 rb_link_node(&new->nd, parent, p); 2749 rb_insert_color(&new->nd, &sp->root); 2750 } 2751 2752 /* Find shared policy intersecting idx */ 2753 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 2754 pgoff_t idx) 2755 { 2756 struct mempolicy *pol = NULL; 2757 struct sp_node *sn; 2758 2759 if (!sp->root.rb_node) 2760 return NULL; 2761 read_lock(&sp->lock); 2762 sn = sp_lookup(sp, idx, idx+1); 2763 if (sn) { 2764 mpol_get(sn->policy); 2765 pol = sn->policy; 2766 } 2767 read_unlock(&sp->lock); 2768 return pol; 2769 } 2770 2771 static void sp_free(struct sp_node *n) 2772 { 2773 mpol_put(n->policy); 2774 kmem_cache_free(sn_cache, n); 2775 } 2776 2777 /** 2778 * mpol_misplaced - check whether current folio node is valid in policy 2779 * 2780 * @folio: folio to be checked 2781 * @vmf: structure describing the fault 2782 * @addr: virtual address in @vma for shared policy lookup and interleave policy 2783 * 2784 * Lookup current policy node id for vma,addr and "compare to" folio's 2785 * node id. Policy determination "mimics" alloc_page_vma(). 2786 * Called from fault path where we know the vma and faulting address. 2787 * 2788 * Return: NUMA_NO_NODE if the page is in a node that is valid for this 2789 * policy, or a suitable node ID to allocate a replacement folio from. 
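 *
 * For example (illustrative): under an MPOL_BIND policy covering nodes 0-1,
 * with MPOL_F_MOF set and MPOL_F_MORON clear, a folio resident on node 2 is
 * reported as misplaced and a node from 0-1 is returned via the zonelist,
 * while a folio already on node 0 or 1 yields NUMA_NO_NODE and stays put.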
2790 */ 2791 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, 2792 unsigned long addr) 2793 { 2794 struct mempolicy *pol; 2795 pgoff_t ilx; 2796 struct zoneref *z; 2797 int curnid = folio_nid(folio); 2798 struct vm_area_struct *vma = vmf->vma; 2799 int thiscpu = raw_smp_processor_id(); 2800 int thisnid = numa_node_id(); 2801 int polnid = NUMA_NO_NODE; 2802 int ret = NUMA_NO_NODE; 2803 2804 /* 2805 * Make sure ptl is held so that we don't preempt and we 2806 * have a stable smp processor id 2807 */ 2808 lockdep_assert_held(vmf->ptl); 2809 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2810 if (!(pol->flags & MPOL_F_MOF)) 2811 goto out; 2812 2813 switch (pol->mode) { 2814 case MPOL_INTERLEAVE: 2815 polnid = interleave_nid(pol, ilx); 2816 break; 2817 2818 case MPOL_WEIGHTED_INTERLEAVE: 2819 polnid = weighted_interleave_nid(pol, ilx); 2820 break; 2821 2822 case MPOL_PREFERRED: 2823 if (node_isset(curnid, pol->nodes)) 2824 goto out; 2825 polnid = first_node(pol->nodes); 2826 break; 2827 2828 case MPOL_LOCAL: 2829 polnid = numa_node_id(); 2830 break; 2831 2832 case MPOL_BIND: 2833 case MPOL_PREFERRED_MANY: 2834 /* 2835 * Even though MPOL_PREFERRED_MANY can allocate pages outside 2836 * policy nodemask we don't allow numa migration to nodes 2837 * outside policy nodemask for now. This is done so that if we 2838 * want demotion to slow memory to happen, before allocating 2839 * from some DRAM node say 'x', we will end up using a 2840 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario 2841 * we should not promote to node 'x' from slow memory node. 2842 */ 2843 if (pol->flags & MPOL_F_MORON) { 2844 /* 2845 * Optimize placement among multiple nodes 2846 * via NUMA balancing 2847 */ 2848 if (node_isset(thisnid, pol->nodes)) 2849 break; 2850 goto out; 2851 } 2852 2853 /* 2854 * use current page if in policy nodemask, 2855 * else select nearest allowed node, if any. 2856 * If no allowed nodes, use current [!misplaced]. 2857 */ 2858 if (node_isset(curnid, pol->nodes)) 2859 goto out; 2860 z = first_zones_zonelist( 2861 node_zonelist(thisnid, GFP_HIGHUSER), 2862 gfp_zone(GFP_HIGHUSER), 2863 &pol->nodes); 2864 polnid = zonelist_node_idx(z); 2865 break; 2866 2867 default: 2868 BUG(); 2869 } 2870 2871 /* Migrate the folio towards the node whose CPU is referencing it */ 2872 if (pol->flags & MPOL_F_MORON) { 2873 polnid = thisnid; 2874 2875 if (!should_numa_migrate_memory(current, folio, curnid, 2876 thiscpu)) 2877 goto out; 2878 } 2879 2880 if (curnid != polnid) 2881 ret = polnid; 2882 out: 2883 mpol_cond_put(pol); 2884 2885 return ret; 2886 } 2887 2888 /* 2889 * Drop the (possibly final) reference to task->mempolicy. It needs to be 2890 * dropped after task->mempolicy is set to NULL so that any allocation done as 2891 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 2892 * policy. 
2893 */ 2894 void mpol_put_task_policy(struct task_struct *task) 2895 { 2896 struct mempolicy *pol; 2897 2898 task_lock(task); 2899 pol = task->mempolicy; 2900 task->mempolicy = NULL; 2901 task_unlock(task); 2902 mpol_put(pol); 2903 } 2904 2905 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2906 { 2907 rb_erase(&n->nd, &sp->root); 2908 sp_free(n); 2909 } 2910 2911 static void sp_node_init(struct sp_node *node, unsigned long start, 2912 unsigned long end, struct mempolicy *pol) 2913 { 2914 node->start = start; 2915 node->end = end; 2916 node->policy = pol; 2917 } 2918 2919 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2920 struct mempolicy *pol) 2921 { 2922 struct sp_node *n; 2923 struct mempolicy *newpol; 2924 2925 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2926 if (!n) 2927 return NULL; 2928 2929 newpol = mpol_dup(pol); 2930 if (IS_ERR(newpol)) { 2931 kmem_cache_free(sn_cache, n); 2932 return NULL; 2933 } 2934 newpol->flags |= MPOL_F_SHARED; 2935 sp_node_init(n, start, end, newpol); 2936 2937 return n; 2938 } 2939 2940 /* Replace a policy range. */ 2941 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 2942 pgoff_t end, struct sp_node *new) 2943 { 2944 struct sp_node *n; 2945 struct sp_node *n_new = NULL; 2946 struct mempolicy *mpol_new = NULL; 2947 int ret = 0; 2948 2949 restart: 2950 write_lock(&sp->lock); 2951 n = sp_lookup(sp, start, end); 2952 /* Take care of old policies in the same range. */ 2953 while (n && n->start < end) { 2954 struct rb_node *next = rb_next(&n->nd); 2955 if (n->start >= start) { 2956 if (n->end <= end) 2957 sp_delete(sp, n); 2958 else 2959 n->start = end; 2960 } else { 2961 /* Old policy spanning whole new range. */ 2962 if (n->end > end) { 2963 if (!n_new) 2964 goto alloc_new; 2965 2966 *mpol_new = *n->policy; 2967 atomic_set(&mpol_new->refcnt, 1); 2968 sp_node_init(n_new, end, n->end, mpol_new); 2969 n->end = start; 2970 sp_insert(sp, n_new); 2971 n_new = NULL; 2972 mpol_new = NULL; 2973 break; 2974 } else 2975 n->end = start; 2976 } 2977 if (!next) 2978 break; 2979 n = rb_entry(next, struct sp_node, nd); 2980 } 2981 if (new) 2982 sp_insert(sp, new); 2983 write_unlock(&sp->lock); 2984 ret = 0; 2985 2986 err_out: 2987 if (mpol_new) 2988 mpol_put(mpol_new); 2989 if (n_new) 2990 kmem_cache_free(sn_cache, n_new); 2991 2992 return ret; 2993 2994 alloc_new: 2995 write_unlock(&sp->lock); 2996 ret = -ENOMEM; 2997 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2998 if (!n_new) 2999 goto err_out; 3000 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3001 if (!mpol_new) 3002 goto err_out; 3003 atomic_set(&mpol_new->refcnt, 1); 3004 goto restart; 3005 } 3006 3007 /** 3008 * mpol_shared_policy_init - initialize shared policy for inode 3009 * @sp: pointer to inode shared policy 3010 * @mpol: struct mempolicy to install 3011 * 3012 * Install non-NULL @mpol in inode's shared policy rb-tree. 3013 * On entry, the current task has a reference on a non-NULL @mpol. 3014 * This must be released on exit. 3015 * This is called at get_inode() calls and we can use GFP_KERNEL. 
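 *
 * Callers such as tmpfs pass the mount-wide mempolicy down here for each new
 * inode (e.g. one set up with "mount -t tmpfs -o mpol=interleave:0-3 tmpfs
 * /mnt"), so every object on that mount starts out with the mount's policy
 * until a per-range policy is installed, e.g. via mbind() on a mapping.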
3016 */ 3017 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3018 { 3019 int ret; 3020 3021 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3022 rwlock_init(&sp->lock); 3023 3024 if (mpol) { 3025 struct sp_node *sn; 3026 struct mempolicy *npol; 3027 NODEMASK_SCRATCH(scratch); 3028 3029 if (!scratch) 3030 goto put_mpol; 3031 3032 /* contextualize the tmpfs mount point mempolicy to this file */ 3033 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3034 if (IS_ERR(npol)) 3035 goto free_scratch; /* no valid nodemask intersection */ 3036 3037 task_lock(current); 3038 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3039 task_unlock(current); 3040 if (ret) 3041 goto put_npol; 3042 3043 /* alloc node covering entire file; adds ref to file's npol */ 3044 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3045 if (sn) 3046 sp_insert(sp, sn); 3047 put_npol: 3048 mpol_put(npol); /* drop initial ref on file's npol */ 3049 free_scratch: 3050 NODEMASK_SCRATCH_FREE(scratch); 3051 put_mpol: 3052 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3053 } 3054 } 3055 3056 int mpol_set_shared_policy(struct shared_policy *sp, 3057 struct vm_area_struct *vma, struct mempolicy *pol) 3058 { 3059 int err; 3060 struct sp_node *new = NULL; 3061 unsigned long sz = vma_pages(vma); 3062 3063 if (pol) { 3064 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3065 if (!new) 3066 return -ENOMEM; 3067 } 3068 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3069 if (err && new) 3070 sp_free(new); 3071 return err; 3072 } 3073 3074 /* Free a backing policy store on inode delete. */ 3075 void mpol_free_shared_policy(struct shared_policy *sp) 3076 { 3077 struct sp_node *n; 3078 struct rb_node *next; 3079 3080 if (!sp->root.rb_node) 3081 return; 3082 write_lock(&sp->lock); 3083 next = rb_first(&sp->root); 3084 while (next) { 3085 n = rb_entry(next, struct sp_node, nd); 3086 next = rb_next(&n->nd); 3087 sp_delete(sp, n); 3088 } 3089 write_unlock(&sp->lock); 3090 } 3091 3092 #ifdef CONFIG_NUMA_BALANCING 3093 static int __initdata numabalancing_override; 3094 3095 static void __init check_numabalancing_enable(void) 3096 { 3097 bool numabalancing_default = false; 3098 3099 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 3100 numabalancing_default = true; 3101 3102 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3103 if (numabalancing_override) 3104 set_numabalancing_state(numabalancing_override == 1); 3105 3106 if (num_online_nodes() > 1 && !numabalancing_override) { 3107 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3108 numabalancing_default ? 
"Enabling" : "Disabling"); 3109 set_numabalancing_state(numabalancing_default); 3110 } 3111 } 3112 3113 static int __init setup_numabalancing(char *str) 3114 { 3115 int ret = 0; 3116 if (!str) 3117 goto out; 3118 3119 if (!strcmp(str, "enable")) { 3120 numabalancing_override = 1; 3121 ret = 1; 3122 } else if (!strcmp(str, "disable")) { 3123 numabalancing_override = -1; 3124 ret = 1; 3125 } 3126 out: 3127 if (!ret) 3128 pr_warn("Unable to parse numa_balancing=\n"); 3129 3130 return ret; 3131 } 3132 __setup("numa_balancing=", setup_numabalancing); 3133 #else 3134 static inline void __init check_numabalancing_enable(void) 3135 { 3136 } 3137 #endif /* CONFIG_NUMA_BALANCING */ 3138 3139 void __init numa_policy_init(void) 3140 { 3141 nodemask_t interleave_nodes; 3142 unsigned long largest = 0; 3143 int nid, prefer = 0; 3144 3145 policy_cache = kmem_cache_create("numa_policy", 3146 sizeof(struct mempolicy), 3147 0, SLAB_PANIC, NULL); 3148 3149 sn_cache = kmem_cache_create("shared_policy_node", 3150 sizeof(struct sp_node), 3151 0, SLAB_PANIC, NULL); 3152 3153 for_each_node(nid) { 3154 preferred_node_policy[nid] = (struct mempolicy) { 3155 .refcnt = ATOMIC_INIT(1), 3156 .mode = MPOL_PREFERRED, 3157 .flags = MPOL_F_MOF | MPOL_F_MORON, 3158 .nodes = nodemask_of_node(nid), 3159 }; 3160 } 3161 3162 /* 3163 * Set interleaving policy for system init. Interleaving is only 3164 * enabled across suitably sized nodes (default is >= 16MB), or 3165 * fall back to the largest node if they're all smaller. 3166 */ 3167 nodes_clear(interleave_nodes); 3168 for_each_node_state(nid, N_MEMORY) { 3169 unsigned long total_pages = node_present_pages(nid); 3170 3171 /* Preserve the largest node */ 3172 if (largest < total_pages) { 3173 largest = total_pages; 3174 prefer = nid; 3175 } 3176 3177 /* Interleave this node? */ 3178 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3179 node_set(nid, interleave_nodes); 3180 } 3181 3182 /* All too small, use the largest */ 3183 if (unlikely(nodes_empty(interleave_nodes))) 3184 node_set(prefer, interleave_nodes); 3185 3186 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3187 pr_err("%s: interleaving failed\n", __func__); 3188 3189 check_numabalancing_enable(); 3190 } 3191 3192 /* Reset policy of current process to default */ 3193 void numa_default_policy(void) 3194 { 3195 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 3196 } 3197 3198 /* 3199 * Parse and format mempolicy from/to strings 3200 */ 3201 static const char * const policy_modes[] = 3202 { 3203 [MPOL_DEFAULT] = "default", 3204 [MPOL_PREFERRED] = "prefer", 3205 [MPOL_BIND] = "bind", 3206 [MPOL_INTERLEAVE] = "interleave", 3207 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3208 [MPOL_LOCAL] = "local", 3209 [MPOL_PREFERRED_MANY] = "prefer (many)", 3210 }; 3211 3212 #ifdef CONFIG_TMPFS 3213 /** 3214 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 3215 * @str: string containing mempolicy to parse 3216 * @mpol: pointer to struct mempolicy pointer, returned on success. 
3217 * 3218 * Format of input: 3219 * <mode>[=<flags>][:<nodelist>] 3220 * 3221 * Return: %0 on success, else %1 3222 */ 3223 int mpol_parse_str(char *str, struct mempolicy **mpol) 3224 { 3225 struct mempolicy *new = NULL; 3226 unsigned short mode_flags; 3227 nodemask_t nodes; 3228 char *nodelist = strchr(str, ':'); 3229 char *flags = strchr(str, '='); 3230 int err = 1, mode; 3231 3232 if (flags) 3233 *flags++ = '\0'; /* terminate mode string */ 3234 3235 if (nodelist) { 3236 /* NUL-terminate mode or flags string */ 3237 *nodelist++ = '\0'; 3238 if (nodelist_parse(nodelist, nodes)) 3239 goto out; 3240 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3241 goto out; 3242 } else 3243 nodes_clear(nodes); 3244 3245 mode = match_string(policy_modes, MPOL_MAX, str); 3246 if (mode < 0) 3247 goto out; 3248 3249 switch (mode) { 3250 case MPOL_PREFERRED: 3251 /* 3252 * Insist on a nodelist of one node only, although later 3253 * we use first_node(nodes) to grab a single node, so here 3254 * nodelist (or nodes) cannot be empty. 3255 */ 3256 if (nodelist) { 3257 char *rest = nodelist; 3258 while (isdigit(*rest)) 3259 rest++; 3260 if (*rest) 3261 goto out; 3262 if (nodes_empty(nodes)) 3263 goto out; 3264 } 3265 break; 3266 case MPOL_INTERLEAVE: 3267 case MPOL_WEIGHTED_INTERLEAVE: 3268 /* 3269 * Default to online nodes with memory if no nodelist 3270 */ 3271 if (!nodelist) 3272 nodes = node_states[N_MEMORY]; 3273 break; 3274 case MPOL_LOCAL: 3275 /* 3276 * Don't allow a nodelist; mpol_new() checks flags 3277 */ 3278 if (nodelist) 3279 goto out; 3280 break; 3281 case MPOL_DEFAULT: 3282 /* 3283 * Insist on a empty nodelist 3284 */ 3285 if (!nodelist) 3286 err = 0; 3287 goto out; 3288 case MPOL_PREFERRED_MANY: 3289 case MPOL_BIND: 3290 /* 3291 * Insist on a nodelist 3292 */ 3293 if (!nodelist) 3294 goto out; 3295 } 3296 3297 mode_flags = 0; 3298 if (flags) { 3299 /* 3300 * Currently, we only support two mutually exclusive 3301 * mode flags. 3302 */ 3303 if (!strcmp(flags, "static")) 3304 mode_flags |= MPOL_F_STATIC_NODES; 3305 else if (!strcmp(flags, "relative")) 3306 mode_flags |= MPOL_F_RELATIVE_NODES; 3307 else 3308 goto out; 3309 } 3310 3311 new = mpol_new(mode, mode_flags, &nodes); 3312 if (IS_ERR(new)) 3313 goto out; 3314 3315 /* 3316 * Save nodes for mpol_to_str() to show the tmpfs mount options 3317 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3318 */ 3319 if (mode != MPOL_PREFERRED) { 3320 new->nodes = nodes; 3321 } else if (nodelist) { 3322 nodes_clear(new->nodes); 3323 node_set(first_node(nodes), new->nodes); 3324 } else { 3325 new->mode = MPOL_LOCAL; 3326 } 3327 3328 /* 3329 * Save nodes for contextualization: this will be used to "clone" 3330 * the mempolicy in a specific context [cpuset] at a later time. 3331 */ 3332 new->w.user_nodemask = nodes; 3333 3334 err = 0; 3335 3336 out: 3337 /* Restore string for error message */ 3338 if (nodelist) 3339 *--nodelist = ':'; 3340 if (flags) 3341 *--flags = '='; 3342 if (!err) 3343 *mpol = new; 3344 return err; 3345 } 3346 #endif /* CONFIG_TMPFS */ 3347 3348 /** 3349 * mpol_to_str - format a mempolicy structure for printing 3350 * @buffer: to contain formatted mempolicy string 3351 * @maxlen: length of @buffer 3352 * @pol: pointer to mempolicy to be formatted 3353 * 3354 * Convert @pol into a string. If @buffer is too short, truncate the string. 3355 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3356 * interleave", plus the longest flag flags, "relative|balancing", and to 3357 * display at least a few node ids. 
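 *
 * Example round trip (illustrative): the string "bind=static:0-3", as handed
 * over by tmpfs for the mount option mpol=bind=static:0-3, parses via
 * mpol_parse_str() above into MPOL_BIND with MPOL_F_STATIC_NODES over nodes
 * 0-3, and this function formats it back as "bind=static:0-3" for
 * /proc/mounts.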
3358 */ 3359 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3360 { 3361 char *p = buffer; 3362 nodemask_t nodes = NODE_MASK_NONE; 3363 unsigned short mode = MPOL_DEFAULT; 3364 unsigned short flags = 0; 3365 3366 if (pol && 3367 pol != &default_policy && 3368 !(pol >= &preferred_node_policy[0] && 3369 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3370 mode = pol->mode; 3371 flags = pol->flags; 3372 } 3373 3374 switch (mode) { 3375 case MPOL_DEFAULT: 3376 case MPOL_LOCAL: 3377 break; 3378 case MPOL_PREFERRED: 3379 case MPOL_PREFERRED_MANY: 3380 case MPOL_BIND: 3381 case MPOL_INTERLEAVE: 3382 case MPOL_WEIGHTED_INTERLEAVE: 3383 nodes = pol->nodes; 3384 break; 3385 default: 3386 WARN_ON_ONCE(1); 3387 snprintf(p, maxlen, "unknown"); 3388 return; 3389 } 3390 3391 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3392 3393 if (flags & MPOL_MODE_FLAGS) { 3394 p += snprintf(p, buffer + maxlen - p, "="); 3395 3396 /* 3397 * Static and relative are mutually exclusive. 3398 */ 3399 if (flags & MPOL_F_STATIC_NODES) 3400 p += snprintf(p, buffer + maxlen - p, "static"); 3401 else if (flags & MPOL_F_RELATIVE_NODES) 3402 p += snprintf(p, buffer + maxlen - p, "relative"); 3403 3404 if (flags & MPOL_F_NUMA_BALANCING) { 3405 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3406 p += snprintf(p, buffer + maxlen - p, "|"); 3407 p += snprintf(p, buffer + maxlen - p, "balancing"); 3408 } 3409 } 3410 3411 if (!nodes_empty(nodes)) 3412 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3413 nodemask_pr_args(&nodes)); 3414 } 3415 3416 #ifdef CONFIG_SYSFS 3417 struct iw_node_attr { 3418 struct kobj_attribute kobj_attr; 3419 int nid; 3420 }; 3421 3422 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3423 char *buf) 3424 { 3425 struct iw_node_attr *node_attr; 3426 u8 weight; 3427 3428 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3429 weight = get_il_weight(node_attr->nid); 3430 return sysfs_emit(buf, "%d\n", weight); 3431 } 3432 3433 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3434 const char *buf, size_t count) 3435 { 3436 struct iw_node_attr *node_attr; 3437 u8 *new; 3438 u8 *old; 3439 u8 weight = 0; 3440 3441 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3442 if (count == 0 || sysfs_streq(buf, "")) 3443 weight = 0; 3444 else if (kstrtou8(buf, 0, &weight)) 3445 return -EINVAL; 3446 3447 new = kzalloc(nr_node_ids, GFP_KERNEL); 3448 if (!new) 3449 return -ENOMEM; 3450 3451 mutex_lock(&iw_table_lock); 3452 old = rcu_dereference_protected(iw_table, 3453 lockdep_is_held(&iw_table_lock)); 3454 if (old) 3455 memcpy(new, old, nr_node_ids); 3456 new[node_attr->nid] = weight; 3457 rcu_assign_pointer(iw_table, new); 3458 mutex_unlock(&iw_table_lock); 3459 synchronize_rcu(); 3460 kfree(old); 3461 return count; 3462 } 3463 3464 static struct iw_node_attr **node_attrs; 3465 3466 static void sysfs_wi_node_release(struct iw_node_attr *node_attr, 3467 struct kobject *parent) 3468 { 3469 if (!node_attr) 3470 return; 3471 sysfs_remove_file(parent, &node_attr->kobj_attr.attr); 3472 kfree(node_attr->kobj_attr.attr.name); 3473 kfree(node_attr); 3474 } 3475 3476 static void sysfs_wi_release(struct kobject *wi_kobj) 3477 { 3478 int i; 3479 3480 for (i = 0; i < nr_node_ids; i++) 3481 sysfs_wi_node_release(node_attrs[i], wi_kobj); 3482 kobject_put(wi_kobj); 3483 } 3484 3485 static const struct kobj_type wi_ktype = { 3486 .sysfs_ops = &kobj_sysfs_ops, 3487 .release = sysfs_wi_release, 3488 }; 3489 3490 
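/*
 * Example (illustrative, userspace view): the per-node attributes created
 * below appear under /sys/kernel/mm/mempolicy/weighted_interleave/, so a
 * 3:1 split between nodes 0 and 1 can be configured with:
 *
 *	echo 3 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 * Writing an empty string to a node file resets that node to the system
 * default weight of 1 (see node_store() above).
 */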
static int add_weight_node(int nid, struct kobject *wi_kobj) 3491 { 3492 struct iw_node_attr *node_attr; 3493 char *name; 3494 3495 node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); 3496 if (!node_attr) 3497 return -ENOMEM; 3498 3499 name = kasprintf(GFP_KERNEL, "node%d", nid); 3500 if (!name) { 3501 kfree(node_attr); 3502 return -ENOMEM; 3503 } 3504 3505 sysfs_attr_init(&node_attr->kobj_attr.attr); 3506 node_attr->kobj_attr.attr.name = name; 3507 node_attr->kobj_attr.attr.mode = 0644; 3508 node_attr->kobj_attr.show = node_show; 3509 node_attr->kobj_attr.store = node_store; 3510 node_attr->nid = nid; 3511 3512 if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { 3513 kfree(node_attr->kobj_attr.attr.name); 3514 kfree(node_attr); 3515 pr_err("failed to add attribute to weighted_interleave\n"); 3516 return -ENOMEM; 3517 } 3518 3519 node_attrs[nid] = node_attr; 3520 return 0; 3521 } 3522 3523 static int add_weighted_interleave_group(struct kobject *root_kobj) 3524 { 3525 struct kobject *wi_kobj; 3526 int nid, err; 3527 3528 wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); 3529 if (!wi_kobj) 3530 return -ENOMEM; 3531 3532 err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, 3533 "weighted_interleave"); 3534 if (err) { 3535 kfree(wi_kobj); 3536 return err; 3537 } 3538 3539 for_each_node_state(nid, N_POSSIBLE) { 3540 err = add_weight_node(nid, wi_kobj); 3541 if (err) { 3542 pr_err("failed to add sysfs [node%d]\n", nid); 3543 break; 3544 } 3545 } 3546 if (err) 3547 kobject_put(wi_kobj); 3548 return 0; 3549 } 3550 3551 static void mempolicy_kobj_release(struct kobject *kobj) 3552 { 3553 u8 *old; 3554 3555 mutex_lock(&iw_table_lock); 3556 old = rcu_dereference_protected(iw_table, 3557 lockdep_is_held(&iw_table_lock)); 3558 rcu_assign_pointer(iw_table, NULL); 3559 mutex_unlock(&iw_table_lock); 3560 synchronize_rcu(); 3561 kfree(old); 3562 kfree(node_attrs); 3563 kfree(kobj); 3564 } 3565 3566 static const struct kobj_type mempolicy_ktype = { 3567 .release = mempolicy_kobj_release 3568 }; 3569 3570 static int __init mempolicy_sysfs_init(void) 3571 { 3572 int err; 3573 static struct kobject *mempolicy_kobj; 3574 3575 mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); 3576 if (!mempolicy_kobj) { 3577 err = -ENOMEM; 3578 goto err_out; 3579 } 3580 3581 node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), 3582 GFP_KERNEL); 3583 if (!node_attrs) { 3584 err = -ENOMEM; 3585 goto mempol_out; 3586 } 3587 3588 err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, 3589 "mempolicy"); 3590 if (err) 3591 goto node_out; 3592 3593 err = add_weighted_interleave_group(mempolicy_kobj); 3594 if (err) { 3595 pr_err("mempolicy sysfs structure failed to initialize\n"); 3596 kobject_put(mempolicy_kobj); 3597 return err; 3598 } 3599 3600 return err; 3601 node_out: 3602 kfree(node_attrs); 3603 mempol_out: 3604 kfree(mempolicy_kobj); 3605 err_out: 3606 pr_err("failed to add mempolicy kobject to the system\n"); 3607 return err; 3608 } 3609 3610 late_initcall(mempolicy_sysfs_init); 3611 #endif /* CONFIG_SYSFS */ 3612