// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/mempolicy.h> 81 #include <linux/pagewalk.h> 82 #include <linux/highmem.h> 83 #include <linux/hugetlb.h> 84 #include <linux/kernel.h> 85 #include <linux/sched.h> 86 #include <linux/sched/mm.h> 87 #include <linux/sched/numa_balancing.h> 88 #include <linux/sched/sysctl.h> 89 #include <linux/sched/task.h> 90 #include <linux/nodemask.h> 91 #include <linux/cpuset.h> 92 #include <linux/slab.h> 93 #include <linux/string.h> 94 #include <linux/export.h> 95 #include <linux/nsproxy.h> 96 #include <linux/interrupt.h> 97 #include <linux/init.h> 98 #include <linux/compat.h> 99 #include <linux/ptrace.h> 100 #include <linux/swap.h> 101 #include <linux/seq_file.h> 102 #include <linux/proc_fs.h> 103 #include <linux/memory-tiers.h> 104 #include <linux/migrate.h> 105 #include <linux/ksm.h> 106 #include <linux/rmap.h> 107 #include <linux/security.h> 108 #include <linux/syscalls.h> 109 #include <linux/ctype.h> 110 #include <linux/mm_inline.h> 111 #include <linux/mmu_notifier.h> 112 #include <linux/printk.h> 113 #include <linux/leafops.h> 114 #include <linux/gcd.h> 115 116 #include <asm/tlbflush.h> 117 #include <asm/tlb.h> 118 #include <linux/uaccess.h> 119 #include <linux/memory.h> 120 121 #include "internal.h" 122 123 /* Internal flags */ 124 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 125 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 126 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ 127 128 static struct kmem_cache *policy_cache; 129 static struct kmem_cache *sn_cache; 130 131 /* Highest zone. An specific allocation for a zone below that is not 132 policied. 
*/ 133 enum zone_type policy_zone = 0; 134 135 /* 136 * run-time system-wide default policy => local allocation 137 */ 138 static struct mempolicy default_policy = { 139 .refcnt = ATOMIC_INIT(1), /* never free it */ 140 .mode = MPOL_LOCAL, 141 }; 142 143 static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 144 145 /* 146 * weightiness balances the tradeoff between small weights (cycles through nodes 147 * faster, more fair/even distribution) and large weights (smaller errors 148 * between actual bandwidth ratios and weight ratios). 32 is a number that has 149 * been found to perform at a reasonable compromise between the two goals. 150 */ 151 static const int weightiness = 32; 152 153 /* 154 * A null weighted_interleave_state is interpreted as having .mode="auto", 155 * and .iw_table is interpreted as an array of 1s with length nr_node_ids. 156 */ 157 struct weighted_interleave_state { 158 bool mode_auto; 159 u8 iw_table[]; 160 }; 161 static struct weighted_interleave_state __rcu *wi_state; 162 static unsigned int *node_bw_table; 163 164 /* 165 * wi_state_lock protects both wi_state and node_bw_table. 166 * node_bw_table is only used by writers to update wi_state. 167 */ 168 static DEFINE_MUTEX(wi_state_lock); 169 170 static u8 get_il_weight(int node) 171 { 172 struct weighted_interleave_state *state; 173 u8 weight = 1; 174 175 rcu_read_lock(); 176 state = rcu_dereference(wi_state); 177 if (state) 178 weight = state->iw_table[node]; 179 rcu_read_unlock(); 180 return weight; 181 } 182 183 /* 184 * Convert bandwidth values into weighted interleave weights. 185 * Call with wi_state_lock. 
186 */ 187 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) 188 { 189 u64 sum_bw = 0; 190 unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; 191 int nid; 192 193 for_each_node_state(nid, N_MEMORY) 194 sum_bw += bw[nid]; 195 196 /* Scale bandwidths to whole numbers in the range [1, weightiness] */ 197 for_each_node_state(nid, N_MEMORY) { 198 /* 199 * Try not to perform 64-bit division. 200 * If sum_bw < scaling_factor, then sum_bw < U32_MAX. 201 * If sum_bw > scaling_factor, then round the weight up to 1. 202 */ 203 scaling_factor = weightiness * bw[nid]; 204 if (bw[nid] && sum_bw < scaling_factor) { 205 cast_sum_bw = (unsigned int)sum_bw; 206 new_iw[nid] = scaling_factor / cast_sum_bw; 207 } else { 208 new_iw[nid] = 1; 209 } 210 if (!iw_gcd) 211 iw_gcd = new_iw[nid]; 212 iw_gcd = gcd(iw_gcd, new_iw[nid]); 213 } 214 215 /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ 216 for_each_node_state(nid, N_MEMORY) 217 new_iw[nid] /= iw_gcd; 218 } 219 220 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) 221 { 222 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 223 unsigned int *old_bw, *new_bw; 224 unsigned int bw_val; 225 int i; 226 227 bw_val = min(coords->read_bandwidth, coords->write_bandwidth); 228 new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); 229 if (!new_bw) 230 return -ENOMEM; 231 232 new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids); 233 if (!new_wi_state) { 234 kfree(new_bw); 235 return -ENOMEM; 236 } 237 new_wi_state->mode_auto = true; 238 for (i = 0; i < nr_node_ids; i++) 239 new_wi_state->iw_table[i] = 1; 240 241 /* 242 * Update bandwidth info, even in manual mode. That way, when switching 243 * to auto mode in the future, iw_table can be overwritten using 244 * accurate bw data. 
245 */ 246 mutex_lock(&wi_state_lock); 247 248 old_bw = node_bw_table; 249 if (old_bw) 250 memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); 251 new_bw[node] = bw_val; 252 node_bw_table = new_bw; 253 254 old_wi_state = rcu_dereference_protected(wi_state, 255 lockdep_is_held(&wi_state_lock)); 256 if (old_wi_state && !old_wi_state->mode_auto) { 257 /* Manual mode; skip reducing weights and updating wi_state */ 258 mutex_unlock(&wi_state_lock); 259 kfree(new_wi_state); 260 goto out; 261 } 262 263 /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ 264 reduce_interleave_weights(new_bw, new_wi_state->iw_table); 265 rcu_assign_pointer(wi_state, new_wi_state); 266 267 mutex_unlock(&wi_state_lock); 268 if (old_wi_state) { 269 synchronize_rcu(); 270 kfree(old_wi_state); 271 } 272 out: 273 kfree(old_bw); 274 return 0; 275 } 276 277 /** 278 * numa_nearest_node - Find nearest node by state 279 * @node: Node id to start the search 280 * @state: State to filter the search 281 * 282 * Lookup the closest node by distance if @nid is not in state. 283 * 284 * Return: this @node if it is in state, otherwise the closest node by distance 285 */ 286 int numa_nearest_node(int node, unsigned int state) 287 { 288 int min_dist = INT_MAX, dist, n, min_node; 289 290 if (state >= NR_NODE_STATES) 291 return -EINVAL; 292 293 if (node == NUMA_NO_NODE || node_state(node, state)) 294 return node; 295 296 min_node = node; 297 for_each_node_state(n, state) { 298 dist = node_distance(node, n); 299 if (dist < min_dist) { 300 min_dist = dist; 301 min_node = n; 302 } 303 } 304 305 return min_node; 306 } 307 EXPORT_SYMBOL_GPL(numa_nearest_node); 308 309 /** 310 * nearest_node_nodemask - Find the node in @mask at the nearest distance 311 * from @node. 312 * 313 * @node: a valid node ID to start the search from. 314 * @mask: a pointer to a nodemask representing the allowed nodes. 
315 * 316 * This function iterates over all nodes in @mask and calculates the 317 * distance from the starting @node, then it returns the node ID that is 318 * the closest to @node, or MAX_NUMNODES if no node is found. 319 * 320 * Note that @node must be a valid node ID usable with node_distance(), 321 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 322 * or unexpected behavior. 323 */ 324 int nearest_node_nodemask(int node, nodemask_t *mask) 325 { 326 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 327 328 for_each_node_mask(n, *mask) { 329 dist = node_distance(node, n); 330 if (dist < min_dist) { 331 min_dist = dist; 332 min_node = n; 333 } 334 } 335 336 return min_node; 337 } 338 EXPORT_SYMBOL_GPL(nearest_node_nodemask); 339 340 struct mempolicy *get_task_policy(struct task_struct *p) 341 { 342 struct mempolicy *pol = p->mempolicy; 343 int node; 344 345 if (pol) 346 return pol; 347 348 node = numa_node_id(); 349 if (node != NUMA_NO_NODE) { 350 pol = &preferred_node_policy[node]; 351 /* preferred_node_policy is not initialised early in boot */ 352 if (pol->mode) 353 return pol; 354 } 355 356 return &default_policy; 357 } 358 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); 359 360 static const struct mempolicy_operations { 361 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 362 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 363 } mpol_ops[MPOL_MAX]; 364 365 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 366 { 367 return pol->flags & MPOL_USER_NODEMASK_FLAGS; 368 } 369 370 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 371 const nodemask_t *rel) 372 { 373 nodemask_t tmp; 374 nodes_fold(tmp, *orig, nodes_weight(*rel)); 375 nodes_onto(*ret, tmp, *rel); 376 } 377 378 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 379 { 380 if (nodes_empty(*nodes)) 381 return -EINVAL; 382 pol->nodes = *nodes; 383 return 0; 384 } 385 
386 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 387 { 388 if (nodes_empty(*nodes)) 389 return -EINVAL; 390 391 nodes_clear(pol->nodes); 392 node_set(first_node(*nodes), pol->nodes); 393 return 0; 394 } 395 396 /* 397 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 398 * any, for the new policy. mpol_new() has already validated the nodes 399 * parameter with respect to the policy mode and flags. 400 * 401 * Must be called holding task's alloc_lock to protect task's mems_allowed 402 * and mempolicy. May also be called holding the mmap_lock for write. 403 */ 404 static int mpol_set_nodemask(struct mempolicy *pol, 405 const nodemask_t *nodes, struct nodemask_scratch *nsc) 406 { 407 int ret; 408 409 /* 410 * Default (pol==NULL) resp. local memory policies are not a 411 * subject of any remapping. They also do not need any special 412 * constructor. 413 */ 414 if (!pol || pol->mode == MPOL_LOCAL) 415 return 0; 416 417 /* Check N_MEMORY */ 418 nodes_and(nsc->mask1, 419 cpuset_current_mems_allowed, node_states[N_MEMORY]); 420 421 VM_BUG_ON(!nodes); 422 423 if (pol->flags & MPOL_F_RELATIVE_NODES) 424 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 425 else 426 nodes_and(nsc->mask2, *nodes, nsc->mask1); 427 428 if (mpol_store_user_nodemask(pol)) 429 pol->w.user_nodemask = *nodes; 430 else 431 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 432 433 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 434 return ret; 435 } 436 437 /* 438 * This function just creates a new policy, does some check and simple 439 * initialization. You must invoke mpol_set_nodemask() to set nodes. 
440 */ 441 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 442 nodemask_t *nodes) 443 { 444 struct mempolicy *policy; 445 446 if (mode == MPOL_DEFAULT) { 447 if (nodes && !nodes_empty(*nodes)) 448 return ERR_PTR(-EINVAL); 449 return NULL; 450 } 451 VM_BUG_ON(!nodes); 452 453 /* 454 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 455 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 456 * All other modes require a valid pointer to a non-empty nodemask. 457 */ 458 if (mode == MPOL_PREFERRED) { 459 if (nodes_empty(*nodes)) { 460 if (((flags & MPOL_F_STATIC_NODES) || 461 (flags & MPOL_F_RELATIVE_NODES))) 462 return ERR_PTR(-EINVAL); 463 464 mode = MPOL_LOCAL; 465 } 466 } else if (mode == MPOL_LOCAL) { 467 if (!nodes_empty(*nodes) || 468 (flags & MPOL_F_STATIC_NODES) || 469 (flags & MPOL_F_RELATIVE_NODES)) 470 return ERR_PTR(-EINVAL); 471 } else if (nodes_empty(*nodes)) 472 return ERR_PTR(-EINVAL); 473 474 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 475 if (!policy) 476 return ERR_PTR(-ENOMEM); 477 atomic_set(&policy->refcnt, 1); 478 policy->mode = mode; 479 policy->flags = flags; 480 policy->home_node = NUMA_NO_NODE; 481 482 return policy; 483 } 484 485 /* Slow path of a mpol destructor. */ 486 void __mpol_put(struct mempolicy *pol) 487 { 488 if (!atomic_dec_and_test(&pol->refcnt)) 489 return; 490 /* 491 * Required to allow mmap_lock_speculative*() access, see for example 492 * futex_key_to_node_opt(). All accesses are serialized by mmap_lock, 493 * however the speculative lock section unbound by the normal lock 494 * boundaries, requiring RCU freeing. 
495 */ 496 kfree_rcu(pol, rcu); 497 } 498 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); 499 500 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 501 { 502 } 503 504 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 505 { 506 nodemask_t tmp; 507 508 if (pol->flags & MPOL_F_STATIC_NODES) 509 nodes_and(tmp, pol->w.user_nodemask, *nodes); 510 else if (pol->flags & MPOL_F_RELATIVE_NODES) 511 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 512 else { 513 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 514 *nodes); 515 pol->w.cpuset_mems_allowed = *nodes; 516 } 517 518 if (nodes_empty(tmp)) 519 tmp = *nodes; 520 521 pol->nodes = tmp; 522 } 523 524 static void mpol_rebind_preferred(struct mempolicy *pol, 525 const nodemask_t *nodes) 526 { 527 pol->w.cpuset_mems_allowed = *nodes; 528 } 529 530 /* 531 * mpol_rebind_policy - Migrate a policy to a different set of nodes 532 * 533 * Per-vma policies are protected by mmap_lock. Allocations using per-task 534 * policies are protected by task->mems_allowed_seq to prevent a premature 535 * OOM/allocation failure due to parallel nodemask modification. 536 */ 537 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 538 { 539 if (!pol || pol->mode == MPOL_LOCAL) 540 return; 541 if (!mpol_store_user_nodemask(pol) && 542 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 543 return; 544 545 mpol_ops[pol->mode].rebind(pol, newmask); 546 } 547 548 /* 549 * Wrapper for mpol_rebind_policy() that just requires task 550 * pointer, and updates task mempolicy. 551 * 552 * Called with task's alloc_lock held. 553 */ 554 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 555 { 556 mpol_rebind_policy(tsk->mempolicy, new); 557 } 558 559 /* 560 * Rebind each vma in mm to new nodemask. 561 * 562 * Call holding a reference to mm. Takes mm->mmap_lock during call. 
563 */ 564 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 565 { 566 struct vm_area_struct *vma; 567 VMA_ITERATOR(vmi, mm, 0); 568 569 mmap_write_lock(mm); 570 for_each_vma(vmi, vma) { 571 vma_start_write(vma); 572 mpol_rebind_policy(vma->vm_policy, new); 573 } 574 mmap_write_unlock(mm); 575 } 576 577 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 578 [MPOL_DEFAULT] = { 579 .rebind = mpol_rebind_default, 580 }, 581 [MPOL_INTERLEAVE] = { 582 .create = mpol_new_nodemask, 583 .rebind = mpol_rebind_nodemask, 584 }, 585 [MPOL_PREFERRED] = { 586 .create = mpol_new_preferred, 587 .rebind = mpol_rebind_preferred, 588 }, 589 [MPOL_BIND] = { 590 .create = mpol_new_nodemask, 591 .rebind = mpol_rebind_nodemask, 592 }, 593 [MPOL_LOCAL] = { 594 .rebind = mpol_rebind_default, 595 }, 596 [MPOL_PREFERRED_MANY] = { 597 .create = mpol_new_nodemask, 598 .rebind = mpol_rebind_preferred, 599 }, 600 [MPOL_WEIGHTED_INTERLEAVE] = { 601 .create = mpol_new_nodemask, 602 .rebind = mpol_rebind_nodemask, 603 }, 604 }; 605 606 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 607 unsigned long flags); 608 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 609 pgoff_t ilx, int *nid); 610 611 static bool strictly_unmovable(unsigned long flags) 612 { 613 /* 614 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 615 * if any misplaced page is found. 
616 */ 617 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 618 MPOL_MF_STRICT; 619 } 620 621 struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 622 struct mempolicy *pol; 623 pgoff_t ilx; 624 }; 625 626 struct queue_pages { 627 struct list_head *pagelist; 628 unsigned long flags; 629 nodemask_t *nmask; 630 unsigned long start; 631 unsigned long end; 632 struct vm_area_struct *first; 633 struct folio *large; /* note last large folio encountered */ 634 long nr_failed; /* could not be isolated at this time */ 635 }; 636 637 /* 638 * Check if the folio's nid is in qp->nmask. 639 * 640 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 641 * in the invert of qp->nmask. 642 */ 643 static inline bool queue_folio_required(struct folio *folio, 644 struct queue_pages *qp) 645 { 646 int nid = folio_nid(folio); 647 unsigned long flags = qp->flags; 648 649 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 650 } 651 652 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 653 { 654 struct folio *folio; 655 struct queue_pages *qp = walk->private; 656 657 if (unlikely(pmd_is_migration_entry(*pmd))) { 658 qp->nr_failed++; 659 return; 660 } 661 folio = pmd_folio(*pmd); 662 if (is_huge_zero_folio(folio)) { 663 walk->action = ACTION_CONTINUE; 664 return; 665 } 666 if (!queue_folio_required(folio, qp)) 667 return; 668 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 669 !vma_migratable(walk->vma) || 670 !migrate_folio_add(folio, qp->pagelist, qp->flags)) 671 qp->nr_failed++; 672 } 673 674 /* 675 * Scan through folios, checking if they satisfy the required conditions, 676 * moving them from LRU to local pagelist for migration if they do (or not). 677 * 678 * queue_folios_pte_range() has two possible return values: 679 * 0 - continue walking to scan for more, even if an existing folio on the 680 * wrong node could not be isolated and queued for migration. 
681 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 682 * and an existing folio was on a node that does not follow the policy. 683 */ 684 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 685 unsigned long end, struct mm_walk *walk) 686 { 687 struct vm_area_struct *vma = walk->vma; 688 struct folio *folio; 689 struct queue_pages *qp = walk->private; 690 unsigned long flags = qp->flags; 691 pte_t *pte, *mapped_pte; 692 pte_t ptent; 693 spinlock_t *ptl; 694 int max_nr, nr; 695 696 ptl = pmd_trans_huge_lock(pmd, vma); 697 if (ptl) { 698 queue_folios_pmd(pmd, walk); 699 spin_unlock(ptl); 700 goto out; 701 } 702 703 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 704 if (!pte) { 705 walk->action = ACTION_AGAIN; 706 return 0; 707 } 708 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { 709 max_nr = (end - addr) >> PAGE_SHIFT; 710 nr = 1; 711 ptent = ptep_get(pte); 712 if (pte_none(ptent)) 713 continue; 714 if (!pte_present(ptent)) { 715 const softleaf_t entry = softleaf_from_pte(ptent); 716 717 if (softleaf_is_migration(entry)) 718 qp->nr_failed++; 719 continue; 720 } 721 folio = vm_normal_folio(vma, addr, ptent); 722 if (!folio || folio_is_zone_device(folio)) 723 continue; 724 if (folio_test_large(folio) && max_nr != 1) 725 nr = folio_pte_batch(folio, pte, ptent, max_nr); 726 /* 727 * vm_normal_folio() filters out zero pages, but there might 728 * still be reserved folios to skip, perhaps in a VDSO. 729 */ 730 if (folio_test_reserved(folio)) 731 continue; 732 if (!queue_folio_required(folio, qp)) 733 continue; 734 if (folio_test_large(folio)) { 735 /* 736 * A large folio can only be isolated from LRU once, 737 * but may be mapped by many PTEs (and Copy-On-Write may 738 * intersperse PTEs of other, order 0, folios). 
This is 739 * a common case, so don't mistake it for failure (but 740 * there can be other cases of multi-mapped pages which 741 * this quick check does not help to filter out - and a 742 * search of the pagelist might grow to be prohibitive). 743 * 744 * migrate_pages(&pagelist) returns nr_failed folios, so 745 * check "large" now so that queue_pages_range() returns 746 * a comparable nr_failed folios. This does imply that 747 * if folio could not be isolated for some racy reason 748 * at its first PTE, later PTEs will not give it another 749 * chance of isolation; but keeps the accounting simple. 750 */ 751 if (folio == qp->large) 752 continue; 753 qp->large = folio; 754 } 755 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 756 !vma_migratable(vma) || 757 !migrate_folio_add(folio, qp->pagelist, flags)) { 758 qp->nr_failed += nr; 759 if (strictly_unmovable(flags)) 760 break; 761 } 762 } 763 pte_unmap_unlock(mapped_pte, ptl); 764 cond_resched(); 765 out: 766 if (qp->nr_failed && strictly_unmovable(flags)) 767 return -EIO; 768 return 0; 769 } 770 771 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, 772 unsigned long addr, unsigned long end, 773 struct mm_walk *walk) 774 { 775 #ifdef CONFIG_HUGETLB_PAGE 776 struct queue_pages *qp = walk->private; 777 unsigned long flags = qp->flags; 778 struct folio *folio; 779 spinlock_t *ptl; 780 pte_t ptep; 781 782 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 783 ptep = huge_ptep_get(walk->mm, addr, pte); 784 if (!pte_present(ptep)) { 785 if (!huge_pte_none(ptep)) { 786 const softleaf_t entry = softleaf_from_pte(ptep); 787 788 if (unlikely(softleaf_is_migration(entry))) 789 qp->nr_failed++; 790 } 791 792 goto unlock; 793 } 794 folio = pfn_folio(pte_pfn(ptep)); 795 if (!queue_folio_required(folio, qp)) 796 goto unlock; 797 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 798 !vma_migratable(walk->vma)) { 799 qp->nr_failed++; 800 goto unlock; 801 } 802 /* 803 * Unless MPOL_MF_MOVE_ALL, we try to 
avoid migrating a shared folio. 804 * Choosing not to migrate a shared folio is not counted as a failure. 805 * 806 * See folio_maybe_mapped_shared() on possible imprecision when we 807 * cannot easily detect if a folio is shared. 808 */ 809 if ((flags & MPOL_MF_MOVE_ALL) || 810 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) 811 if (!folio_isolate_hugetlb(folio, qp->pagelist)) 812 qp->nr_failed++; 813 unlock: 814 spin_unlock(ptl); 815 if (qp->nr_failed && strictly_unmovable(flags)) 816 return -EIO; 817 #endif 818 return 0; 819 } 820 821 #ifdef CONFIG_NUMA_BALANCING 822 /** 823 * folio_can_map_prot_numa() - check whether the folio can map prot numa 824 * @folio: The folio whose mapping considered for being made NUMA hintable 825 * @vma: The VMA that the folio belongs to. 826 * @is_private_single_threaded: Is this a single-threaded private VMA or not 827 * 828 * This function checks to see if the folio actually indicates that 829 * we need to make the mapping one which causes a NUMA hinting fault, 830 * as there are cases where it's simply unnecessary, and the folio's 831 * access time is adjusted for memory tiering if prot numa needed. 832 * 833 * Return: True if the mapping of the folio needs to be changed, false otherwise. 834 */ 835 bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, 836 bool is_private_single_threaded) 837 { 838 int nid; 839 840 if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) 841 return false; 842 843 /* Also skip shared copy-on-write folios */ 844 if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) 845 return false; 846 847 /* Folios are pinned and can't be migrated */ 848 if (folio_maybe_dma_pinned(folio)) 849 return false; 850 851 /* 852 * While migration can move some dirty folios, 853 * it cannot move them all from MIGRATE_ASYNC 854 * context. 
855 */ 856 if (folio_is_file_lru(folio) && folio_test_dirty(folio)) 857 return false; 858 859 /* 860 * Don't mess with PTEs if folio is already on the node 861 * a single-threaded process is running on. 862 */ 863 nid = folio_nid(folio); 864 if (is_private_single_threaded && (nid == numa_node_id())) 865 return false; 866 867 /* 868 * Skip scanning top tier node if normal numa 869 * balancing is disabled 870 */ 871 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 872 node_is_toptier(nid)) 873 return false; 874 875 if (folio_use_access_time(folio)) 876 folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); 877 878 return true; 879 } 880 881 /* 882 * This is used to mark a range of virtual addresses to be inaccessible. 883 * These are later cleared by a NUMA hinting fault. Depending on these 884 * faults, pages may be migrated for better NUMA placement. 885 * 886 * This is assuming that NUMA faults are handled using PROT_NONE. If 887 * an architecture makes a different choice, it will need further 888 * changes to the core. 
889 */ 890 unsigned long change_prot_numa(struct vm_area_struct *vma, 891 unsigned long addr, unsigned long end) 892 { 893 struct mmu_gather tlb; 894 long nr_updated; 895 896 tlb_gather_mmu(&tlb, vma->vm_mm); 897 898 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); 899 if (nr_updated > 0) { 900 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 901 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); 902 } 903 904 tlb_finish_mmu(&tlb); 905 906 return nr_updated; 907 } 908 #endif /* CONFIG_NUMA_BALANCING */ 909 910 static int queue_pages_test_walk(unsigned long start, unsigned long end, 911 struct mm_walk *walk) 912 { 913 struct vm_area_struct *next, *vma = walk->vma; 914 struct queue_pages *qp = walk->private; 915 unsigned long flags = qp->flags; 916 917 /* range check first */ 918 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); 919 920 if (!qp->first) { 921 qp->first = vma; 922 if (!(flags & MPOL_MF_DISCONTIG_OK) && 923 (qp->start < vma->vm_start)) 924 /* hole at head side of range */ 925 return -EFAULT; 926 } 927 next = find_vma(vma->vm_mm, vma->vm_end); 928 if (!(flags & MPOL_MF_DISCONTIG_OK) && 929 ((vma->vm_end < qp->end) && 930 (!next || vma->vm_end < next->vm_start))) 931 /* hole at middle or tail of range */ 932 return -EFAULT; 933 934 /* 935 * Need check MPOL_MF_STRICT to return -EIO if possible 936 * regardless of vma_migratable 937 */ 938 if (!vma_migratable(vma) && 939 !(flags & MPOL_MF_STRICT)) 940 return 1; 941 942 /* 943 * Check page nodes, and queue pages to move, in the current vma. 944 * But if no moving, and no strict checking, the scan can be skipped. 
945 */ 946 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 947 return 0; 948 return 1; 949 } 950 951 static const struct mm_walk_ops queue_pages_walk_ops = { 952 .hugetlb_entry = queue_folios_hugetlb, 953 .pmd_entry = queue_folios_pte_range, 954 .test_walk = queue_pages_test_walk, 955 .walk_lock = PGWALK_RDLOCK, 956 }; 957 958 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 959 .hugetlb_entry = queue_folios_hugetlb, 960 .pmd_entry = queue_folios_pte_range, 961 .test_walk = queue_pages_test_walk, 962 .walk_lock = PGWALK_WRLOCK, 963 }; 964 965 /* 966 * Walk through page tables and collect pages to be migrated. 967 * 968 * If pages found in a given range are not on the required set of @nodes, 969 * and migration is allowed, they are isolated and queued to @pagelist. 970 * 971 * queue_pages_range() may return: 972 * 0 - all pages already on the right node, or successfully queued for moving 973 * (or neither strict checking nor moving requested: only range checking). 974 * >0 - this number of misplaced folios could not be queued for moving 975 * (a hugetlbfs page or a transparent huge page being counted as 1). 976 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. 977 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. 978 */ 979 static long 980 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 981 nodemask_t *nodes, unsigned long flags, 982 struct list_head *pagelist) 983 { 984 int err; 985 struct queue_pages qp = { 986 .pagelist = pagelist, 987 .flags = flags, 988 .nmask = nodes, 989 .start = start, 990 .end = end, 991 .first = NULL, 992 }; 993 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? 994 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 995 996 err = walk_page_range(mm, start, end, ops, &qp); 997 998 if (!qp.first) 999 /* whole range in hole */ 1000 err = -EFAULT; 1001 1002 return err ? 
		: qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	/* Take our own reference; @pol stays owned by the caller. */
	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/* Shared mappings store the policy in the backing object. */
	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */
	mpol_put(old);

	return 0;
err_out:
	mpol_put(new);
	return err;
}

/*
 * Split or merge the VMA (if required) and apply the new policy.
 * On success *prev is advanced to the (possibly merged) VMA so the
 * caller's iteration can continue from it.
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	/* Clamp the affected range to this VMA. */
	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do if the VMA already carries an equal policy. */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* task_lock() serializes updates of current->mempolicy. */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		/*
		 * Reset the interleave cursor so allocation restarts from
		 * the first node of the (possibly new) nodemask.
		 */
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (pol == &default_policy)
		return;

	switch (pol->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		*nodes = pol->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

/*
 * Return the NUMA node id of the page backing @addr, a negative errno
 * from get_user_pages_fast(), or 0 if no page could be pinned.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		/* MPOL_F_MEMS_ALLOWED is exclusive of the other flags. */
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		pgoff_t ilx;		/* ignored here */
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		pol = __get_vma_policy(vma, addr, &ilx);
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			if (current->il_weight)
				*policy = current->il_prev;
			else
				*policy = next_node_in(current->il_prev,
						       pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio onto @foliolist for later migration.
 * Returns false only when the folio looked movable but could not be
 * isolated; skipping a shared folio is not counted as a failure.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else {
			/*
			 * Non-movable folio may reach here.  And, there may be
			 * temporary off LRU folios or non-LRU movable folios.
			 * Treat them as unmovable folios since they can't be
			 * isolated, so they can't be moved at the moment.
			 */
			return false;
		}
	}
	return true;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		/* __GFP_THISNODE: allocate strictly on the destination node */
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		/* Empty address space: nothing to migrate. */
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* Fold isolation failures into the not-migrated count. */
	if (err >= 0)
		err += nr_failed;
	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of page that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* Clamp so the syscall's int return cannot overflow. */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* Advance the interleave index by the folio's file offset. */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else

/* !CONFIG_MIGRATION stub: nothing can be isolated for migration. */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}

/* !CONFIG_MIGRATION stub: page migration is unsupported. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

/* !CONFIG_MIGRATION stub: no migration target can be allocated. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
#endif

/*
 * Apply @new (mode @mode, flags @mode_flags over @nmask) to the address
 * range [start, start + len), optionally migrating misplaced pages when
 * MPOL_MF_MOVE / MPOL_MF_MOVE_ALL are set in @flags.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* Skip KSM folios: they have no stable VMA address. */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	/* Clear bits beyond maxnode in the last word. */
	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* Ignore the in-range bits of the straddling word. */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* Zero the user tail beyond the kernel's node count. */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	/* The mode flags are passed in the high bits of the mode argument. */
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
		else
			return -EINVAL;
	}
	return 0;
}

/* mbind() entry point shared by the native and compat syscalls. */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error.  We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

/* migrate_pages() entry point: move @pid's pages from @old_nodes to @new_nodes. */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict the targets to the caller's own allowed nodes. */
	task_nodes = cpuset_mems_allowed(current);
	if (!nodes_and(*new, *new, task_nodes))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/* Can this VMA's pages be migrated between nodes? */
bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone.  If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

/*
 * Return the raw policy attached to @vma (via the object's get_policy
 * vm_op when present, else vma->vm_policy); may be NULL.  *ilx is
 * zeroed and may be updated by the get_policy vm_op.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	return (vma->vm_ops && vma->vm_ops->get_policy) ?
		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
				 unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol;

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE ||
	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
		/* Derive the interleave index from the file/vma offset. */
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
	return pol;
}

/* Does the effective policy for @vma have MPOL_F_MOF (migrate on fault) set? */
bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;
		pgoff_t ilx;		/* ignored here */

		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

/* Should @policy be applied for an allocation from @zone? */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

/*
 * Pick the next node for MPOL_WEIGHTED_INTERLEAVE, consuming one unit
 * of the current node's remaining weight per call.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}

/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	/* Interrupts and other non-task contexts ignore policies. */
	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_WEIGHTED_INTERLEAVE:
		return weighted_interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}

/*
 * Copy pol->nodes into *mask as a stable local snapshot and return its
 * node count.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes.  Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}

/*
 * Map static interleave index @ilx onto a node id, weighting each node
 * by its entry in the global weighted-interleave table (weight 1 when
 * the table is uninitialized).
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the ilx'th
 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
 * exceeds the number of present nodes.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	nodemask_t nodemask;
	unsigned int target, nnodes;
	int i;
	int nid;

	nnodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nnodes)
		return numa_node_id();
	target = ilx % nnodes;
	nid = first_node(nodemask);
	for (i = 0; i < target; i++)
		nid = next_node(nid, nodemask);
	return nid;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				   pgoff_t ilx, int *nid)
{
	nodemask_t *nodemask = NULL;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* Override input node id */
		*nid = first_node(pol->nodes);
		break;
	case MPOL_PREFERRED_MANY:
		nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		break;
	case MPOL_BIND:
		/* Restrict to nodemask (but not on lower zones) */
		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
			nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(gfp & __GFP_THISNODE);
		break;
	case MPOL_INTERLEAVE:
		/* Override input node id */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			interleave_nodes(pol) : interleave_nid(pol, ilx);
		break;
	case MPOL_WEIGHTED_INTERLEAVE:
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			weighted_interleave_nodes(pol) :
			weighted_interleave_nid(pol, ilx);
		break;
	}

	return nodemask;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
 * to the mempolicy's @nodemask for filtering the zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
	      struct mempolicy **mpol, nodemask_t **nodemask)
{
	pgoff_t ilx;
	int nid;

	nid = numa_node_id();
	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
	return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy. Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining it's own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;

	if (!(mask && current->mempolicy))
		return false;

	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		*mask = mempolicy->nodes;
		break;

	case MPOL_LOCAL:
		init_nodemask_of_node(mask, numa_node_id());
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * If tsk's mempolicy is "bind", check for intersection between mask and
 * the policy nodemask. Otherwise, return true for all other policies
 * including "interleave", as a tsk with "interleave" policy may have
 * memory allocated from all nodes in system.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
			     const nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	bool ret = true;

	if (!mask)
		return ret;

	task_lock(tsk);
	mempolicy = tsk->mempolicy;
	if (mempolicy && mempolicy->mode == MPOL_BIND)
		ret = nodes_intersects(mempolicy->nodes, *mask);
	task_unlock(tsk);

	return ret;
}

/*
 * Two-pass allocation for MPOL_PREFERRED_MANY: try the preferred nodes
 * first without direct reclaim, then fall back to any node in the system.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
					       int nid, nodemask_t *nodemask)
{
	struct page *page;
	gfp_t preferred_gfp;

	/*
	 * This is a two pass approach. The first pass will only try the
	 * preferred nodes but skip the direct reclaim and allow the
	 * allocation to fail, while the second pass will try all the
	 * nodes in system.
	 */
	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
	if (!page)
		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);

	return page;
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
				     struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}

/* Allocate a frozen compound page per @pol, then hand it back refcounted. */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
					     ilx, nid);
	if (!page)
		return NULL;

	set_page_refcounted(page);
	return page_rmappable_folio(page);
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation. Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away. Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct folio *folio;

	if (vma->vm_flags & VM_DROPPABLE)
		gfp |= __GFP_NOWARN;

	pol = get_vma_policy(vma, addr, order, &ilx);
	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
	mpol_cond_put(pol);
	return folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_frozen_pages_noprof(gfp, order);

	if (page)
		set_page_refcounted(page);
	return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);

struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * Bulk-allocate @nr_pages spread round-robin across the interleave
 * nodemask; the first 'delta' nodes absorb the remainder pages.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		if (delta) {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node + 1,
					page_array);
			delta--;
		} else {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node, page_array);
		}

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}

/*
 * Bulk-allocate @nr_pages per the weighted-interleave policy, resuming
 * from (and afterwards updating) current->il_prev / current->il_weight.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		/* Uninitialized wi_state: treat every node's weight as 1 */
		rcu_read_unlock();
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}

/*
 * Bulk-allocate for MPOL_PREFERRED_MANY: first pass restricted to the
 * preferred nodemask without direct reclaim, second pass unrestricted.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp;
	unsigned long nr_allocated = 0;

	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
					       nr_pages, page_array);

	if (nr_allocated < nr_pages)
		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
						nr_pages - nr_allocated,
						page_array + nr_allocated);
	return nr_allocated;
}

/* alloc pages bulk and mempolicy should be considered at the
 * same
 * time in some situation such as vmalloc.
 *
 * It can accelerate memory allocation especially interleaving
 * allocate memory.
 */
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
		unsigned long nr_pages, struct page **page_array)
{
	struct mempolicy *pol = &default_policy;
	nodemask_t *nodemask;
	int nid;

	/* No reference counting needed for current->mempolicy */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_pages_bulk_interleave(gfp, pol,
						   nr_pages, page_array);

	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
		return alloc_pages_bulk_weighted_interleave(
				gfp, pol, nr_pages, page_array);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_bulk_preferred_many(gfp,
				numa_node_id(), pol, nr_pages, page_array);

	nid = numa_node_id();
	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
				       nr_pages, page_array);
}

/* Duplicate @src's mempolicy (if any) into @dst; returns 0 or -ENOMEM. */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(src->vm_policy);

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebinded by the other task(the task that changes
 * cpuset's mems), so we needn't do rebind work for current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	atomic_set(&new->refcnt, 1);
	return new;
}

/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (a->home_node != b->home_node)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		return !!nodes_equal(a->nodes, b->nodes);
	case MPOL_LOCAL:
		return true;
	default:
		BUG();
		return false;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
					pgoff_t start, pgoff_t end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/* Walk back to the leftmost node still intersecting [start, end) */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
}

/* Find shared policy intersecting idx */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
						pgoff_t idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");

/* Drop the node's policy reference and free the node itself. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id. Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now. This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	mpol_put(pol);
}

/* Remove @n from the tree and free it.  Caller holds sp->lock for writing. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}

/* Initialize an sp_node covering [@start, @end) with policy @pol. */
static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}

/* Allocate an sp_node holding a duplicate of @pol for [@start, @end). */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Splitting needs allocations; drop the lock,
				 * allocate, and retry from the top.
				 */
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct sp_node *sn;
		struct mempolicy *npol;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;

		/* contextualize the tmpfs mount point mempolicy to this file */
		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(npol))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_npol;

		/* alloc node covering entire file; adds ref to file's npol */
		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
		if (sn)
			sp_insert(sp, sn);
put_npol:
		mpol_put(npol);	/* drop initial ref on file's npol */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");

/* Install @pol over @vma's pgoff range in the shared policy tree @sp. */
int mpol_set_shared_policy(struct shared_policy *sp,
			struct vm_area_struct *vma, struct mempolicy *pol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	if (pol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
	if (err && new)
		sp_free(new);
	return err;
}
EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");

/* Free a backing policy store on inode delete.
 */
void mpol_free_shared_policy(struct shared_policy *sp)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!sp->root.rb_node)
		return;
	write_lock(&sp->lock);
	next = rb_first(&sp->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(sp, n);
	}
	write_unlock(&sp->lock);
}
EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}

/* Parse the early parameter "numa_balancing=enable|disable". */
static int __init setup_numabalancing(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.nodes = nodemask_of_node(nid),
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
3413 * 3414 * Format of input: 3415 * <mode>[=<flags>][:<nodelist>] 3416 * 3417 * Return: %0 on success, else %1 3418 */ 3419 int mpol_parse_str(char *str, struct mempolicy **mpol) 3420 { 3421 struct mempolicy *new = NULL; 3422 unsigned short mode_flags; 3423 nodemask_t nodes; 3424 char *nodelist = strchr(str, ':'); 3425 char *flags = strchr(str, '='); 3426 int err = 1, mode; 3427 3428 if (flags) 3429 *flags++ = '\0'; /* terminate mode string */ 3430 3431 if (nodelist) { 3432 /* NUL-terminate mode or flags string */ 3433 *nodelist++ = '\0'; 3434 if (nodelist_parse(nodelist, nodes)) 3435 goto out; 3436 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3437 goto out; 3438 } else 3439 nodes_clear(nodes); 3440 3441 mode = match_string(policy_modes, MPOL_MAX, str); 3442 if (mode < 0) 3443 goto out; 3444 3445 switch (mode) { 3446 case MPOL_PREFERRED: 3447 /* 3448 * Insist on a nodelist of one node only, although later 3449 * we use first_node(nodes) to grab a single node, so here 3450 * nodelist (or nodes) cannot be empty. 3451 */ 3452 if (nodelist) { 3453 char *rest = nodelist; 3454 while (isdigit(*rest)) 3455 rest++; 3456 if (*rest) 3457 goto out; 3458 if (nodes_empty(nodes)) 3459 goto out; 3460 } 3461 break; 3462 case MPOL_INTERLEAVE: 3463 case MPOL_WEIGHTED_INTERLEAVE: 3464 /* 3465 * Default to online nodes with memory if no nodelist 3466 */ 3467 if (!nodelist) 3468 nodes = node_states[N_MEMORY]; 3469 break; 3470 case MPOL_LOCAL: 3471 /* 3472 * Don't allow a nodelist; mpol_new() checks flags 3473 */ 3474 if (nodelist) 3475 goto out; 3476 break; 3477 case MPOL_DEFAULT: 3478 /* 3479 * Insist on a empty nodelist 3480 */ 3481 if (!nodelist) 3482 err = 0; 3483 goto out; 3484 case MPOL_PREFERRED_MANY: 3485 case MPOL_BIND: 3486 /* 3487 * Insist on a nodelist 3488 */ 3489 if (!nodelist) 3490 goto out; 3491 } 3492 3493 mode_flags = 0; 3494 if (flags) { 3495 /* 3496 * Currently, we only support two mutually exclusive 3497 * mode flags. 
3498 */ 3499 if (!strcmp(flags, "static")) 3500 mode_flags |= MPOL_F_STATIC_NODES; 3501 else if (!strcmp(flags, "relative")) 3502 mode_flags |= MPOL_F_RELATIVE_NODES; 3503 else 3504 goto out; 3505 } 3506 3507 new = mpol_new(mode, mode_flags, &nodes); 3508 if (IS_ERR(new)) 3509 goto out; 3510 3511 /* 3512 * Save nodes for mpol_to_str() to show the tmpfs mount options 3513 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3514 */ 3515 if (mode != MPOL_PREFERRED) { 3516 new->nodes = nodes; 3517 } else if (nodelist) { 3518 nodes_clear(new->nodes); 3519 node_set(first_node(nodes), new->nodes); 3520 } else { 3521 new->mode = MPOL_LOCAL; 3522 } 3523 3524 /* 3525 * Save nodes for contextualization: this will be used to "clone" 3526 * the mempolicy in a specific context [cpuset] at a later time. 3527 */ 3528 new->w.user_nodemask = nodes; 3529 3530 err = 0; 3531 3532 out: 3533 /* Restore string for error message */ 3534 if (nodelist) 3535 *--nodelist = ':'; 3536 if (flags) 3537 *--flags = '='; 3538 if (!err) 3539 *mpol = new; 3540 return err; 3541 } 3542 #endif /* CONFIG_TMPFS */ 3543 3544 /** 3545 * mpol_to_str - format a mempolicy structure for printing 3546 * @buffer: to contain formatted mempolicy string 3547 * @maxlen: length of @buffer 3548 * @pol: pointer to mempolicy to be formatted 3549 * 3550 * Convert @pol into a string. If @buffer is too short, truncate the string. 3551 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3552 * interleave", plus the longest flag flags, "relative|balancing", and to 3553 * display at least a few node ids. 
3554 */ 3555 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3556 { 3557 char *p = buffer; 3558 nodemask_t nodes = NODE_MASK_NONE; 3559 unsigned short mode = MPOL_DEFAULT; 3560 unsigned short flags = 0; 3561 3562 if (pol && 3563 pol != &default_policy && 3564 !(pol >= &preferred_node_policy[0] && 3565 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3566 mode = pol->mode; 3567 flags = pol->flags; 3568 } 3569 3570 switch (mode) { 3571 case MPOL_DEFAULT: 3572 case MPOL_LOCAL: 3573 break; 3574 case MPOL_PREFERRED: 3575 case MPOL_PREFERRED_MANY: 3576 case MPOL_BIND: 3577 case MPOL_INTERLEAVE: 3578 case MPOL_WEIGHTED_INTERLEAVE: 3579 nodes = pol->nodes; 3580 break; 3581 default: 3582 WARN_ON_ONCE(1); 3583 snprintf(p, maxlen, "unknown"); 3584 return; 3585 } 3586 3587 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3588 3589 if (flags & MPOL_MODE_FLAGS) { 3590 p += snprintf(p, buffer + maxlen - p, "="); 3591 3592 /* 3593 * Static and relative are mutually exclusive. 
3594 */ 3595 if (flags & MPOL_F_STATIC_NODES) 3596 p += snprintf(p, buffer + maxlen - p, "static"); 3597 else if (flags & MPOL_F_RELATIVE_NODES) 3598 p += snprintf(p, buffer + maxlen - p, "relative"); 3599 3600 if (flags & MPOL_F_NUMA_BALANCING) { 3601 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3602 p += snprintf(p, buffer + maxlen - p, "|"); 3603 p += snprintf(p, buffer + maxlen - p, "balancing"); 3604 } 3605 } 3606 3607 if (!nodes_empty(nodes)) 3608 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3609 nodemask_pr_args(&nodes)); 3610 } 3611 3612 #ifdef CONFIG_SYSFS 3613 struct iw_node_attr { 3614 struct kobj_attribute kobj_attr; 3615 int nid; 3616 }; 3617 3618 struct sysfs_wi_group { 3619 struct kobject wi_kobj; 3620 struct mutex kobj_lock; 3621 struct iw_node_attr *nattrs[]; 3622 }; 3623 3624 static struct sysfs_wi_group *wi_group; 3625 3626 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3627 char *buf) 3628 { 3629 struct iw_node_attr *node_attr; 3630 u8 weight; 3631 3632 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3633 weight = get_il_weight(node_attr->nid); 3634 return sysfs_emit(buf, "%d\n", weight); 3635 } 3636 3637 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3638 const char *buf, size_t count) 3639 { 3640 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3641 struct iw_node_attr *node_attr; 3642 u8 weight = 0; 3643 int i; 3644 3645 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3646 if (count == 0 || sysfs_streq(buf, "") || 3647 kstrtou8(buf, 0, &weight) || weight == 0) 3648 return -EINVAL; 3649 3650 new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids); 3651 if (!new_wi_state) 3652 return -ENOMEM; 3653 3654 mutex_lock(&wi_state_lock); 3655 old_wi_state = rcu_dereference_protected(wi_state, 3656 lockdep_is_held(&wi_state_lock)); 3657 if (old_wi_state) { 3658 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3659 
nr_node_ids * sizeof(u8)); 3660 } else { 3661 for (i = 0; i < nr_node_ids; i++) 3662 new_wi_state->iw_table[i] = 1; 3663 } 3664 new_wi_state->iw_table[node_attr->nid] = weight; 3665 new_wi_state->mode_auto = false; 3666 3667 rcu_assign_pointer(wi_state, new_wi_state); 3668 mutex_unlock(&wi_state_lock); 3669 if (old_wi_state) { 3670 synchronize_rcu(); 3671 kfree(old_wi_state); 3672 } 3673 return count; 3674 } 3675 3676 static ssize_t weighted_interleave_auto_show(struct kobject *kobj, 3677 struct kobj_attribute *attr, char *buf) 3678 { 3679 struct weighted_interleave_state *state; 3680 bool wi_auto = true; 3681 3682 rcu_read_lock(); 3683 state = rcu_dereference(wi_state); 3684 if (state) 3685 wi_auto = state->mode_auto; 3686 rcu_read_unlock(); 3687 3688 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); 3689 } 3690 3691 static ssize_t weighted_interleave_auto_store(struct kobject *kobj, 3692 struct kobj_attribute *attr, const char *buf, size_t count) 3693 { 3694 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3695 unsigned int *bw; 3696 bool input; 3697 int i; 3698 3699 if (kstrtobool(buf, &input)) 3700 return -EINVAL; 3701 3702 new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids); 3703 if (!new_wi_state) 3704 return -ENOMEM; 3705 for (i = 0; i < nr_node_ids; i++) 3706 new_wi_state->iw_table[i] = 1; 3707 3708 mutex_lock(&wi_state_lock); 3709 if (!input) { 3710 old_wi_state = rcu_dereference_protected(wi_state, 3711 lockdep_is_held(&wi_state_lock)); 3712 if (!old_wi_state) 3713 goto update_wi_state; 3714 if (input == old_wi_state->mode_auto) { 3715 mutex_unlock(&wi_state_lock); 3716 return count; 3717 } 3718 3719 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3720 nr_node_ids * sizeof(u8)); 3721 goto update_wi_state; 3722 } 3723 3724 bw = node_bw_table; 3725 if (!bw) { 3726 mutex_unlock(&wi_state_lock); 3727 kfree(new_wi_state); 3728 return -ENODEV; 3729 } 3730 3731 new_wi_state->mode_auto = true; 3732 
reduce_interleave_weights(bw, new_wi_state->iw_table); 3733 3734 update_wi_state: 3735 rcu_assign_pointer(wi_state, new_wi_state); 3736 mutex_unlock(&wi_state_lock); 3737 if (old_wi_state) { 3738 synchronize_rcu(); 3739 kfree(old_wi_state); 3740 } 3741 return count; 3742 } 3743 3744 static void sysfs_wi_node_delete(int nid) 3745 { 3746 struct iw_node_attr *attr; 3747 3748 if (nid < 0 || nid >= nr_node_ids) 3749 return; 3750 3751 mutex_lock(&wi_group->kobj_lock); 3752 attr = wi_group->nattrs[nid]; 3753 if (!attr) { 3754 mutex_unlock(&wi_group->kobj_lock); 3755 return; 3756 } 3757 3758 wi_group->nattrs[nid] = NULL; 3759 mutex_unlock(&wi_group->kobj_lock); 3760 3761 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); 3762 kfree(attr->kobj_attr.attr.name); 3763 kfree(attr); 3764 } 3765 3766 static void sysfs_wi_node_delete_all(void) 3767 { 3768 int nid; 3769 3770 for (nid = 0; nid < nr_node_ids; nid++) 3771 sysfs_wi_node_delete(nid); 3772 } 3773 3774 static void wi_state_free(void) 3775 { 3776 struct weighted_interleave_state *old_wi_state; 3777 3778 mutex_lock(&wi_state_lock); 3779 old_wi_state = rcu_dereference_protected(wi_state, 3780 lockdep_is_held(&wi_state_lock)); 3781 rcu_assign_pointer(wi_state, NULL); 3782 mutex_unlock(&wi_state_lock); 3783 3784 if (old_wi_state) { 3785 synchronize_rcu(); 3786 kfree(old_wi_state); 3787 } 3788 } 3789 3790 static struct kobj_attribute wi_auto_attr = 3791 __ATTR(auto, 0664, weighted_interleave_auto_show, 3792 weighted_interleave_auto_store); 3793 3794 static void wi_cleanup(void) { 3795 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3796 sysfs_wi_node_delete_all(); 3797 wi_state_free(); 3798 } 3799 3800 static void wi_kobj_release(struct kobject *wi_kobj) 3801 { 3802 kfree(wi_group); 3803 } 3804 3805 static const struct kobj_type wi_ktype = { 3806 .sysfs_ops = &kobj_sysfs_ops, 3807 .release = wi_kobj_release, 3808 }; 3809 3810 static int sysfs_wi_node_add(int nid) 3811 { 3812 int ret; 3813 char *name; 3814 
struct iw_node_attr *new_attr; 3815 3816 if (nid < 0 || nid >= nr_node_ids) { 3817 pr_err("invalid node id: %d\n", nid); 3818 return -EINVAL; 3819 } 3820 3821 new_attr = kzalloc_obj(*new_attr); 3822 if (!new_attr) 3823 return -ENOMEM; 3824 3825 name = kasprintf(GFP_KERNEL, "node%d", nid); 3826 if (!name) { 3827 kfree(new_attr); 3828 return -ENOMEM; 3829 } 3830 3831 sysfs_attr_init(&new_attr->kobj_attr.attr); 3832 new_attr->kobj_attr.attr.name = name; 3833 new_attr->kobj_attr.attr.mode = 0644; 3834 new_attr->kobj_attr.show = node_show; 3835 new_attr->kobj_attr.store = node_store; 3836 new_attr->nid = nid; 3837 3838 mutex_lock(&wi_group->kobj_lock); 3839 if (wi_group->nattrs[nid]) { 3840 mutex_unlock(&wi_group->kobj_lock); 3841 ret = -EEXIST; 3842 goto out; 3843 } 3844 3845 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); 3846 if (ret) { 3847 mutex_unlock(&wi_group->kobj_lock); 3848 goto out; 3849 } 3850 wi_group->nattrs[nid] = new_attr; 3851 mutex_unlock(&wi_group->kobj_lock); 3852 return 0; 3853 3854 out: 3855 kfree(new_attr->kobj_attr.attr.name); 3856 kfree(new_attr); 3857 return ret; 3858 } 3859 3860 static int wi_node_notifier(struct notifier_block *nb, 3861 unsigned long action, void *data) 3862 { 3863 int err; 3864 struct node_notify *nn = data; 3865 int nid = nn->nid; 3866 3867 switch (action) { 3868 case NODE_ADDED_FIRST_MEMORY: 3869 err = sysfs_wi_node_add(nid); 3870 if (err) 3871 pr_err("failed to add sysfs for node%d during hotplug: %d\n", 3872 nid, err); 3873 break; 3874 case NODE_REMOVED_LAST_MEMORY: 3875 sysfs_wi_node_delete(nid); 3876 break; 3877 } 3878 3879 return NOTIFY_OK; 3880 } 3881 3882 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) 3883 { 3884 int nid, err; 3885 3886 wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids); 3887 if (!wi_group) 3888 return -ENOMEM; 3889 mutex_init(&wi_group->kobj_lock); 3890 3891 err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, 3892 
"weighted_interleave"); 3893 if (err) 3894 goto err_put_kobj; 3895 3896 err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3897 if (err) 3898 goto err_put_kobj; 3899 3900 for_each_online_node(nid) { 3901 if (!node_state(nid, N_MEMORY)) 3902 continue; 3903 3904 err = sysfs_wi_node_add(nid); 3905 if (err) { 3906 pr_err("failed to add sysfs for node%d during init: %d\n", 3907 nid, err); 3908 goto err_cleanup_kobj; 3909 } 3910 } 3911 3912 hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); 3913 return 0; 3914 3915 err_cleanup_kobj: 3916 wi_cleanup(); 3917 kobject_del(&wi_group->wi_kobj); 3918 err_put_kobj: 3919 kobject_put(&wi_group->wi_kobj); 3920 return err; 3921 } 3922 3923 static int __init mempolicy_sysfs_init(void) 3924 { 3925 int err; 3926 static struct kobject *mempolicy_kobj; 3927 3928 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); 3929 if (!mempolicy_kobj) 3930 return -ENOMEM; 3931 3932 err = add_weighted_interleave_group(mempolicy_kobj); 3933 if (err) 3934 goto err_kobj; 3935 3936 return 0; 3937 3938 err_kobj: 3939 kobject_del(mempolicy_kobj); 3940 kobject_put(mempolicy_kobj); 3941 return err; 3942 } 3943 3944 late_initcall(mempolicy_sysfs_init); 3945 #endif /* CONFIG_SYSFS */ 3946