1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Simple NUMA memory policy for the Linux kernel. 4 * 5 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 7 * 8 * NUMA policy allows the user to give hints in which node(s) memory should 9 * be allocated. 10 * 11 * Support six policies per VMA and per process: 12 * 13 * The VMA policy has priority over the process policy for a page fault. 14 * 15 * interleave Allocate memory interleaved over a set of nodes, 16 * with normal fallback if it fails. 17 * For VMA based allocations this interleaves based on the 18 * offset into the backing object or offset into the mapping 19 * for anonymous memory. For process policy an process counter 20 * is used. 21 * 22 * weighted interleave 23 * Allocate memory interleaved over a set of nodes based on 24 * a set of weights (per-node), with normal fallback if it 25 * fails. Otherwise operates the same as interleave. 26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated 27 * on node 0 for every 1 page allocated on node 1. 28 * 29 * bind Only allocate memory on a specific set of nodes, 30 * no fallback. 31 * FIXME: memory is allocated starting with the first node 32 * to the last. It would be better if bind would truly restrict 33 * the allocation to memory nodes instead 34 * 35 * preferred Try a specific node first before normal fallback. 36 * As a special case NUMA_NO_NODE here means do the allocation 37 * on the local CPU. This is normally identical to default, 38 * but useful to set in a VMA when you have a non default 39 * process policy. 40 * 41 * preferred many Try a set of nodes first before normal fallback. This is 42 * similar to preferred without the special case. 43 * 44 * default Allocate on the local node first, or when on a VMA 45 * use the process policy. This is what Linux always did 46 * in a NUMA aware kernel and still does by, ahem, default. 
47 * 48 * The process policy is applied for most non interrupt memory allocations 49 * in that process' context. Interrupts ignore the policies and always 50 * try to allocate on the local CPU. The VMA policy is only applied for memory 51 * allocations for a VMA in the VM. 52 * 53 * Currently there are a few corner cases in swapping where the policy 54 * is not applied, but the majority should be handled. When process policy 55 * is used it is not remembered over swap outs/swap ins. 56 * 57 * Only the highest zone in the zone hierarchy gets policied. Allocations 58 * requesting a lower zone just use default policy. This implies that 59 * on systems with highmem kernel lowmem allocation don't get policied. 60 * Same with GFP_DMA allocations. 61 * 62 * For shmem/tmpfs shared memory the policy is shared between 63 * all users and remembered even when nobody has memory mapped. 64 */ 65 66 /* Notebook: 67 fix mmap readahead to honour policy and enable policy for any page cache 68 object 69 statistics for bigpages 70 global policy for page cache? currently it uses process policy. Requires 71 first item above. 72 handle mremap for shared memory (currently ignored for the policy) 73 grows down? 74 make bind policy root only? It can trigger oom much faster and the 75 kernel is not always grateful with that. 
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/leafops.h>
#include <linux/gcd.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags (stacked above the UAPI MPOL_MF_* bits) */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

/*
 * Slab caches: policy_cache backs struct mempolicy; sn_cache is used
 * elsewhere in this file (not visible in this chunk).
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/* Per-node fallback policies; consulted by get_task_policy() below. */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;
	u8 iw_table[];
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);

/*
 * Read the current weighted-interleave weight for @node under RCU.
 * A NULL wi_state means every node's weight is 1 (see comment above).
 */
static u8 get_il_weight(int node)
{
	struct weighted_interleave_state *state;
	u8 weight = 1;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state)
		weight = state->iw_table[node];
	rcu_read_unlock();
	return weight;
}

/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock.
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		/* Track the running GCD of all weights computed so far. */
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}

/*
 * Record the effective bandwidth of @node and, unless a manual weight
 * table is in effect, recompute and publish (via RCU) a new auto-mode
 * weighted-interleave table derived from the bandwidths.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	/* The smaller of read/write bandwidth is the effective limit. */
	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
		lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		/* Wait out existing RCU readers before freeing the old table. */
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @nid is not in state.
 *
 * Return: this @node if it is in state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (state >= NR_NODE_STATES)
		return -EINVAL;

	if (node == NUMA_NO_NODE || node_state(node, state))
		return node;

	min_node = node;
	for_each_node_state(n, state) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/**
 * nearest_node_nodemask - Find the node in @mask at the nearest distance
 * from @node.
 *
 * @node: a valid node ID to start the search from.
 * @mask: a pointer to a nodemask representing the allowed nodes.
 *
 * This function iterates over all nodes in @mask and calculates the
 * distance from the starting @node, then it returns the node ID that is
 * the closest to @node, or MAX_NUMNODES if no node is found.
 *
 * Note that @node must be a valid node ID usable with node_distance(),
 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
 * or unexpected behavior.
 */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;

	for_each_node_mask(n, *mask) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);

/*
 * Return the policy in effect for task @p: its own mempolicy if set,
 * else the per-node preferred policy for the current node (once it has
 * been initialised), else the system default policy.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

/* Per-mode create/rebind callbacks; the table itself is defined below. */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* True if the policy remembers the user's original nodemask for rebinds. */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

/* Remap @orig onto the nodes of @rel, as used by MPOL_F_RELATIVE_NODES. */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

/* MPOL_PREFERRED keeps only the first node of the requested nodemask. */
static int mpol_new_preferred(struct mempolicy *pol,
			      const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	if (mode == MPOL_DEFAULT) {
		/* MPOL_DEFAULT is represented by a NULL policy, not an object. */
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			/* Empty preferred nodemask degenerates to local alloc. */
			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor.
 */
void __mpol_put(struct mempolicy *pol)
{
	/* Free only when the last reference is dropped. */
	if (!atomic_dec_and_test(&pol->refcnt))
		return;
	kmem_cache_free(policy_cache, pol);
}

/* MPOL_DEFAULT/MPOL_LOCAL carry no nodemask: nothing to rebind. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	/* Never leave the policy with an empty nodemask. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	/* Nothing to do if the cpuset's mems did not actually change. */
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_WEIGHTED_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
};

static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				pgoff_t ilx, int *nid);

static bool strictly_unmovable(unsigned long flags)
{
	/*
	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
	 * if any misplaced page is found.
	 */
	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
		MPOL_MF_STRICT;
}

struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
	struct mempolicy *pol;
	pgoff_t ilx;
};

/* Private state threaded through the page-table walk callbacks below. */
struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
	struct folio *large;		/* note last large folio encountered */
	long nr_failed;			/* could not be isolated at this time */
};

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
					struct queue_pages *qp)
{
	int nid = folio_nid(folio);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/* PMD-mapped THP: queue the single large folio, or count one failure. */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct folio *folio;
	struct queue_pages *qp = walk->private;

	/* A THP under migration cannot be isolated right now. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}
	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}
	if (!queue_folio_required(folio, qp))
		return;
	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma) ||
	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
		qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;
	int max_nr, nr;

	/* A huge PMD is handled wholesale by the PMD-level helper. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		queue_folios_pmd(pmd, walk);
		spin_unlock(ptl);
		goto out;
	}

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* PTE table vanished under us; ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
		max_nr = (end - addr) >> PAGE_SHIFT;
		nr = 1;
		ptent = ptep_get(pte);
		if (pte_none(ptent))
			continue;
		if (!pte_present(ptent)) {
			const softleaf_t entry = softleaf_from_pte(ptent);

			/* A folio already under migration counts as a failure. */
			if (softleaf_is_migration(entry))
				qp->nr_failed++;
			continue;
		}
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* Batch consecutive PTEs of the same large folio. */
		if (folio_test_large(folio) && max_nr != 1)
			nr = folio_pte_batch(folio, pte, ptent, max_nr);
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (folio_test_large(folio)) {
			/*
			 * A large folio can only be isolated from LRU once,
			 * but may be mapped by many PTEs (and Copy-On-Write may
			 * intersperse PTEs of other, order 0, folios).  This is
			 * a common case, so don't mistake it for failure (but
			 * there can be other cases of multi-mapped pages which
			 * this quick check does not help to filter out - and a
			 * search of the pagelist might grow to be prohibitive).
			 *
			 * migrate_pages(&pagelist) returns nr_failed folios, so
			 * check "large" now so that queue_pages_range() returns
			 * a comparable nr_failed folios.  This does imply that
			 * if folio could not be isolated for some racy reason
			 * at its first PTE, later PTEs will not give it another
			 * chance of isolation; but keeps the accounting simple.
			 */
			if (folio == qp->large)
				continue;
			qp->large = folio;
		}
		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
		    !vma_migratable(vma) ||
		    !migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->nr_failed += nr;
			if (strictly_unmovable(flags))
				break;
		}
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();
out:
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
	return 0;
}

static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptep;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptep = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptep)) {
		if (!huge_pte_none(ptep)) {
			const softleaf_t entry = softleaf_from_pte(ptep);

			/* A hugetlb folio under migration cannot be isolated. */
			if (unlikely(softleaf_is_migration(entry)))
				qp->nr_failed++;
		}

		goto unlock;
	}
	folio = pfn_folio(pte_pfn(ptep));
	if (!queue_folio_required(folio, qp))
		goto unlock;
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}
	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to
	 * avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) ||
	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
		if (!folio_isolate_hugetlb(folio, qp->pagelist))
			qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/**
 * folio_can_map_prot_numa() - check whether the folio can map prot numa
 * @folio: The folio whose mapping considered for being made NUMA hintable
 * @vma: The VMA that the folio belongs to.
 * @is_private_single_threaded: Is this a single-threaded private VMA or not
 *
 * This function checks to see if the folio actually indicates that
 * we need to make the mapping one which causes a NUMA hinting fault,
 * as there are cases where it's simply unnecessary, and the folio's
 * access time is adjusted for memory tiering if prot numa needed.
 *
 * Return: True if the mapping of the folio needs to be changed, false otherwise.
 */
bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
			     bool is_private_single_threaded)
{
	int nid;

	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
		return false;

	/* Also skip shared copy-on-write folios */
	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
		return false;

	/* Folios are pinned and can't be migrated */
	if (folio_maybe_dma_pinned(folio))
		return false;

	/*
	 * While migration can move some dirty folios,
	 * it cannot move them all from MIGRATE_ASYNC
	 * context.
	 */
	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
		return false;

	/*
	 * Don't mess with PTEs if folio is already on the node
	 * a single-threaded process is running on.
	 */
	nid = folio_nid(folio);
	if (is_private_single_threaded && (nid == numa_node_id()))
		return false;

	/*
	 * Skip scanning top tier node if normal numa
	 * balancing is disabled
	 */
	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	    node_is_toptier(nid))
		return false;

	/* Memory tiering: record the access time before hinting. */
	if (folio_use_access_time(folio))
		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));

	return true;
}

/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
	}

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Decide, per VMA, whether the walk should scan it (0), skip it (1),
 * or abort with an error for an unexpected hole (-EFAULT).
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	/*
	 * Check page nodes, and queue pages to move, in the current vma.
	 * But if no moving, and no strict checking, the scan can be skipped.
	 */
	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

/* Same callbacks, but write-locks each walked VMA (MPOL_MF_WRLOCK). */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};
	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err ? : qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new;	/* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
{
	unsigned long vmstart, vmend;

	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	/* Nothing to do if the VMA already carries an equal policy. */
	if (mpol_equal(vma->vm_policy, new_pol)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	/* Reset interleave state so allocation starts from a clean cursor. */
	if (new && (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
		current->il_prev = MAX_NUMNODES-1;
		current->il_weight = 0;
	}
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (pol == &default_policy)
		return;

	switch (pol->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		*nodes = pol->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

/*
 * Pin the page mapped at @addr via fast GUP and return its node id,
 * or the (<= 0) get_user_pages_fast() result on failure.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
*nmask = cpuset_current_mems_allowed; 1158 task_unlock(current); 1159 return 0; 1160 } 1161 1162 if (flags & MPOL_F_ADDR) { 1163 pgoff_t ilx; /* ignored here */ 1164 /* 1165 * Do NOT fall back to task policy if the 1166 * vma/shared policy at addr is NULL. We 1167 * want to return MPOL_DEFAULT in this case. 1168 */ 1169 mmap_read_lock(mm); 1170 vma = vma_lookup(mm, addr); 1171 if (!vma) { 1172 mmap_read_unlock(mm); 1173 return -EFAULT; 1174 } 1175 pol = __get_vma_policy(vma, addr, &ilx); 1176 } else if (addr) 1177 return -EINVAL; 1178 1179 if (!pol) 1180 pol = &default_policy; /* indicates default behavior */ 1181 1182 if (flags & MPOL_F_NODE) { 1183 if (flags & MPOL_F_ADDR) { 1184 /* 1185 * Take a refcount on the mpol, because we are about to 1186 * drop the mmap_lock, after which only "pol" remains 1187 * valid, "vma" is stale. 1188 */ 1189 pol_refcount = pol; 1190 vma = NULL; 1191 mpol_get(pol); 1192 mmap_read_unlock(mm); 1193 err = lookup_node(mm, addr); 1194 if (err < 0) 1195 goto out; 1196 *policy = err; 1197 } else if (pol == current->mempolicy && 1198 pol->mode == MPOL_INTERLEAVE) { 1199 *policy = next_node_in(current->il_prev, pol->nodes); 1200 } else if (pol == current->mempolicy && 1201 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1202 if (current->il_weight) 1203 *policy = current->il_prev; 1204 else 1205 *policy = next_node_in(current->il_prev, 1206 pol->nodes); 1207 } else { 1208 err = -EINVAL; 1209 goto out; 1210 } 1211 } else { 1212 *policy = pol == &default_policy ? MPOL_DEFAULT : 1213 pol->mode; 1214 /* 1215 * Internal mempolicy flags must be masked off before exposing 1216 * the policy to userspace. 
1217 */ 1218 *policy |= (pol->flags & MPOL_MODE_FLAGS); 1219 } 1220 1221 err = 0; 1222 if (nmask) { 1223 if (mpol_store_user_nodemask(pol)) { 1224 *nmask = pol->w.user_nodemask; 1225 } else { 1226 task_lock(current); 1227 get_policy_nodemask(pol, nmask); 1228 task_unlock(current); 1229 } 1230 } 1231 1232 out: 1233 mpol_cond_put(pol); 1234 if (vma) 1235 mmap_read_unlock(mm); 1236 if (pol_refcount) 1237 mpol_put(pol_refcount); 1238 return err; 1239 } 1240 1241 #ifdef CONFIG_MIGRATION 1242 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1243 unsigned long flags) 1244 { 1245 /* 1246 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 1247 * Choosing not to migrate a shared folio is not counted as a failure. 1248 * 1249 * See folio_maybe_mapped_shared() on possible imprecision when we 1250 * cannot easily detect if a folio is shared. 1251 */ 1252 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { 1253 if (folio_isolate_lru(folio)) { 1254 list_add_tail(&folio->lru, foliolist); 1255 node_stat_mod_folio(folio, 1256 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1257 folio_nr_pages(folio)); 1258 } else { 1259 /* 1260 * Non-movable folio may reach here. And, there may be 1261 * temporary off LRU folios or non-LRU movable folios. 1262 * Treat them as unmovable folios since they can't be 1263 * isolated, so they can't be moved at the moment. 1264 */ 1265 return false; 1266 } 1267 } 1268 return true; 1269 } 1270 1271 /* 1272 * Migrate pages from one node to a target node. 1273 * Returns error or the number of pages not migrated. 
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
			    int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	long nr_failed;
	long err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		/* __GFP_THISNODE: allocate strictly on the destination node */
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
		.reason = MR_SYSCALL,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	if (unlikely(!vma)) {
		/* address space has no mappings: nothing to migrate */
		mmap_read_unlock(mm);
		return 0;
	}

	/*
	 * This does not migrate the range, but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
	 * but passes back the count of pages which could not be isolated.
	 */
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	/* fold isolation failures into the not-migrated count */
	if (err >= 0)
		err += nr_failed;
	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of page that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	long nr_failed = 0;
	long err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	/* clamp: the syscall's int return cannot represent larger counts */
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	struct migration_mpol *mmpol = (struct migration_mpol *)private;
	struct mempolicy *pol = mmpol->pol;
	pgoff_t ilx = mmpol->ilx;
	unsigned int order;
	int nid = numa_node_id();
	gfp_t gfp;

	order = folio_order(src);
	/* advance the interleave index by this folio's offset in the mapping */
	ilx += src->index >> order;

	if (folio_test_hugetlb(src)) {
		nodemask_t *nodemask;
		struct hstate *h;

		h = folio_hstate(src);
		gfp = htlb_alloc_mask(h);
		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;
	else
		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else

/* !CONFIG_MIGRATION stub: nothing can be isolated for migration */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return false;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct folio *alloc_migration_target_by_mpol(struct folio *src,
						    unsigned long private)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *new;
	unsigned long end;
	long err;
	long nr_failed;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	/* moving another task's shared pages requires privilege */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	if (!err && !list_empty(&pagelist)) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned int order;
			unsigned long addr = -EFAULT;

			/* skip KSM folios: their index is not mapping-based */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (!err && !list_empty(&pagelist)) {
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
/*
 * Copy @maxnode bits from the user bitmap @nmask into @mask, handling the
 * compat (32-bit) bitmap layout, and clear any bits above @maxnode in the
 * last word.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			/* mask off the supported low part of this word */
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		/* user asked for more than we have: zero-fill the excess */
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	/* mode flags are passed ORed into the mode argument; split them out */
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
		else
			return -EINVAL;
	}
	return 0;
}

/* Common entry for the mbind() syscall: validate args and call do_mbind() */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error.  We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		/* dup so the home node change does not affect policy sharers */
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * migrate_pages(2) worker: permission-check the target task and move its
 * pages from @old_nodes to @new_nodes.
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* restrict targets to the caller's own allowed nodes */
	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/* Can pages in this VMA be migrated between NUMA nodes? */
bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone.  If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

/*
 * Raw VMA policy lookup: the vm_ops->get_policy hook (shared mappings)
 * takes precedence over vma->vm_policy.  May return NULL — callers decide
 * how to fall back.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;
	return (vma->vm_ops && vma->vm_ops->get_policy) ?
		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
				 unsigned long addr, int order, pgoff_t *ilx)
{
	struct mempolicy *pol;

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE ||
	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
		/* derive the interleave index from the file/VMA offset */
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
	return pol;
}

/* Does the effective policy for @vma have MPOL_F_MOF (migrate on fault)? */
bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;
		pgoff_t ilx;		/* ignored here */

		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		/* get_policy() may have taken a reference; drop it if shared */
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

/* Should @policy constrain an allocation for zone @zone? */
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->nodes is intersect with node_states[N_MEMORY].
	 * so if the following test fails, it implies
	 * policy->nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

/*
 * Pick the next node for MPOL_WEIGHTED_INTERLEAVE, consuming one unit of
 * the current node's remaining weight (tracked in current->il_weight).
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
	unsigned int node;
	unsigned int cpuset_mems_cookie;

retry:
	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
	cpuset_mems_cookie = read_mems_allowed_begin();
	node = current->il_prev;
	if (!current->il_weight || !node_isset(node, policy->nodes)) {
		node = next_node_in(node, policy->nodes);
		if (read_mems_allowed_retry(cpuset_mems_cookie))
			goto retry;
		if (node == MAX_NUMNODES)
			return node;
		current->il_prev = node;
		current->il_weight = get_il_weight(node);
	}
	current->il_weight--;
	return node;
}

/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
	unsigned int nid;
	unsigned int cpuset_mems_cookie;

	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = next_node_in(current->il_prev, policy->nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	if (nid < MAX_NUMNODES)
		current->il_prev = nid;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	/* interrupts ignore memory policies: use the local node */
	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_WEIGHTED_INTERLEAVE:
		return weighted_interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}

/*
 * Snapshot @pol->nodes into @mask and return its weight (number of set
 * nodes).  The snapshot gives a locally-stable copy to iterate over.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * barrier stabilizes the nodemask locally so that it can be iterated
	 * over safely without concern for changes.  Allocators validate node
	 * selection does not violate mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
	barrier();
	return nodes_weight(*mask);
}

/*
 * Map interleave index @ilx onto a node for MPOL_WEIGHTED_INTERLEAVE by
 * walking the per-node weights until the offset @ilx % total_weight lands
 * inside a node's weight bucket.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	nodemask_t nodemask;
	unsigned int target, nr_nodes;
	u8 *table = NULL;
	unsigned int weight_total = 0;
	u8 weight;
	int nid = 0;

	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nr_nodes)
		return numa_node_id();

	rcu_read_lock();

	state = rcu_dereference(wi_state);
	/* Uninitialized wi_state means we should assume all weights are 1 */
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;
	nid = first_node(nodemask);
	while (target) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
		nid = next_node_in(nid, nodemask);
	}
	rcu_read_unlock();
	return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the ilx'th
 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
 * exceeds the number of present nodes.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	nodemask_t nodemask;
	unsigned int target, nnodes;
	int i;
	int nid;

	nnodes = read_once_policy_nodemask(pol, &nodemask);
	if (!nnodes)
		return numa_node_id();
	target = ilx % nnodes;
	nid = first_node(nodemask);
	/* Advance target steps through the snapshot to the chosen node */
	for (i = 0; i < target; i++)
		nid = next_node(nid, nodemask);
	return nid;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
				   pgoff_t ilx, int *nid)
{
	nodemask_t *nodemask = NULL;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* Override input node id */
		*nid = first_node(pol->nodes);
		break;
	case MPOL_PREFERRED_MANY:
		nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		break;
	case MPOL_BIND:
		/* Restrict to nodemask (but not on lower zones) */
		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
			nodemask = &pol->nodes;
		if (pol->home_node != NUMA_NO_NODE)
			*nid = pol->home_node;
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(gfp & __GFP_THISNODE);
		break;
	case MPOL_INTERLEAVE:
		/* Override input node id */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			interleave_nodes(pol) : interleave_nid(pol, ilx);
		break;
	case MPOL_WEIGHTED_INTERLEAVE:
		/* Override input node id */
		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
			weighted_interleave_nodes(pol) :
			weighted_interleave_nid(pol, ilx);
		break;
	}

	return nodemask;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
 * to the mempolicy's @nodemask for filtering the zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
	      struct mempolicy **mpol, nodemask_t **nodemask)
{
	pgoff_t ilx;
	int nid;

	nid = numa_node_id();
	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
	return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;

	if (!(mask && current->mempolicy))
		return false;

	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		*mask = mempolicy->nodes;
		break;

	case MPOL_LOCAL:
		init_nodemask_of_node(mask, numa_node_id());
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * If tsk's mempolicy is "bind", check for intersection between mask and
 * the policy nodemask. Otherwise, return true for all other policies
 * including "interleave", as a tsk with "interleave" policy may have
 * memory allocated from all nodes in system.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
			     const nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	bool ret = true;

	if (!mask)
		return ret;

	task_lock(tsk);
	mempolicy = tsk->mempolicy;
	if (mempolicy && mempolicy->mode == MPOL_BIND)
		ret = nodes_intersects(mempolicy->nodes, *mask);
	task_unlock(tsk);

	return ret;
}

/*
 * alloc_pages_preferred_many - two-pass allocation for MPOL_PREFERRED_MANY.
 *
 * First pass restricts itself to @nodemask without direct reclaim so it
 * can fail fast; second pass retries with the caller's @gfp on any node.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
					       int nid, nodemask_t *nodemask)
{
	struct page *page;
	gfp_t preferred_gfp;

	/*
	 * This is a two pass approach. The first pass will only try the
	 * preferred nodes but skip the direct reclaim and allow the
	 * allocation to fail, while the second pass will try all the
	 * nodes in system.
	 */
	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
	if (!page)
		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);

	return page;
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
				     struct mempolicy *pol, pgoff_t ilx, int nid)
{
	nodemask_t *nodemask;
	struct page *page;

	nodemask = policy_nodemask(gfp, pol, ilx, &nid);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_preferred_many(gfp, order, nid, nodemask);

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode != MPOL_INTERLEAVE &&
		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
		    (!nodemask || node_isset(nid, *nodemask))) {
			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
			page = __alloc_frozen_pages_noprof(
				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
				nid, NULL);
			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
				return page;
			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
		}
	}

	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
		if (static_branch_likely(&vm_numa_stat_key) &&
		    page_to_nid(page) == nid) {
			preempt_disable();
			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
			preempt_enable();
		}
	}

	return page;
}

/*
 * folio_alloc_mpol_noprof - allocate a refcounted folio per @pol.
 *
 * Wraps alloc_pages_mpol() with __GFP_COMP, takes the initial page
 * reference and converts the result into an rmappable folio.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
					     ilx, nid);
	if (!page)
		return NULL;

	set_page_refcounted(page);
	return page_rmappable_folio(page);
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away.  Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct folio *folio;

	/* Droppable mappings tolerate allocation failure; don't warn */
	if (vma->vm_flags & VM_DROPPABLE)
		gfp |= __GFP_NOWARN;

	pol = get_vma_policy(vma, addr, order, &ilx);
	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
	mpol_cond_put(pol);
	return folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

/*
 * alloc_frozen_pages_noprof - allocate pages honouring the task mempolicy.
 *
 * Falls back to the system default policy in interrupt context or when
 * __GFP_THISNODE already pins the allocation to one node.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_frozen_pages_noprof(gfp, order);

	if (page)
		set_page_refcounted(page);
	return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);

/* Folio variant of alloc_pages(): adds __GFP_COMP and converts the page */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * alloc_pages_bulk_interleave - bulk allocation for MPOL_INTERLEAVE.
 *
 * Distributes nr_pages evenly over the policy nodes, handing out the
 * remainder (delta) one extra page at a time in interleave order.
 * Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		if (delta) {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node + 1,
					page_array);
			delta--;
		} else {
			nr_allocated = alloc_pages_bulk_noprof(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node, page_array);
		}

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}

/*
 * alloc_pages_bulk_weighted_interleave - bulk allocation for
 * MPOL_WEIGHTED_INTERLEAVE.
 *
 * Resumes from the task's saved interleave position (il_prev/il_weight),
 * then distributes whole weighted rounds plus one partial round across
 * the policy nodes, and finally records where the next allocation
 * should resume.  Returns the number of pages actually allocated.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		rcu_read_unlock();
		/* Uninitialized wi_state: every node weighs 1 */
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
						  page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}

/*
 * alloc_pages_bulk_preferred_many - bulk allocation for MPOL_PREFERRED_MANY.
 *
 * Same two-pass scheme as alloc_pages_preferred_many(): preferred nodes
 * first without direct reclaim, then any node for the remainder.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp;
	unsigned long nr_allocated = 0;

	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
					       nr_pages, page_array);

	if (nr_allocated < nr_pages)
		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
				nr_pages - nr_allocated,
				page_array + nr_allocated);
	return nr_allocated;
}

/*
 * alloc pages bulk and mempolicy should be considered at the
 * same time in some situation such as vmalloc.
 *
 * It can accelerate memory allocation especially interleaving
 * allocate memory.
 */
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
		unsigned long nr_pages, struct page **page_array)
{
	struct mempolicy *pol = &default_policy;
	nodemask_t *nodemask;
	int nid;

	/* Same default-policy rules as alloc_frozen_pages_noprof() */
	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_pages_bulk_interleave(gfp, pol,
						   nr_pages, page_array);

	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
		return alloc_pages_bulk_weighted_interleave(
				gfp, pol, nr_pages, page_array);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_bulk_preferred_many(gfp,
				numa_node_id(), pol, nr_pages, page_array);

	nid = numa_node_id();
	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
				       nr_pages, page_array);
}

/* Copy @src's vm_policy into @dst; propagates mpol_dup() errors (-ENOMEM) */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(src->vm_policy);

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebinded by the other task(the task that changes
 * cpuset's mems), so we needn't do rebind work for current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	/* The copy starts with its own single reference */
	atomic_set(&new->refcnt, 1);
	return new;
}

/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (a->home_node != b->home_node)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_WEIGHTED_INTERLEAVE:
		return !!nodes_equal(a->nodes, b->nodes);
	case MPOL_LOCAL:
		return true;
	default:
		BUG();
		return false;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
				 pgoff_t start, pgoff_t end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/* Walk back to the first (lowest) node still intersecting the range */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();	/* ranges in the tree must not overlap */
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
}

/* Find shared policy intersecting idx */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
						pgoff_t idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		/* take a reference before dropping the lock */
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}

/* Free an sp_node and drop its reference on the contained policy */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current folio node
is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
		   unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	struct zoneref *z;
	int curnid = folio_nid(folio);
	struct vm_area_struct *vma = vmf->vma;
	int thiscpu = raw_smp_processor_id();
	int thisnid = numa_node_id();
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	/*
	 * Make sure ptl is held so that we don't preempt and we
	 * have a stable smp processor id
	 */
	lockdep_assert_held(vmf->ptl);
	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		polnid = interleave_nid(pol, ilx);
		break;

	case MPOL_WEIGHTED_INTERLEAVE:
		polnid = weighted_interleave_nid(pol, ilx);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
		/*
		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
		 * policy nodemask we don't allow numa migration to nodes
		 * outside policy nodemask for now.  This is done so that if we
		 * want demotion to slow memory to happen, before allocating
		 * from some DRAM node say 'x', we will end up using a
		 * MPOL_PREFERRED_MANY mask excluding node 'x'.  In such scenario
		 * we should not promote to node 'x' from slow memory node.
		 */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Optimize placement among multiple nodes
			 * via NUMA balancing
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}

		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(thisnid, GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zonelist_node_idx(z);
		break;

	default:
		BUG();
	}

	/* Migrate the folio towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, folio, curnid,
						thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	mpol_put(pol);
}

/* Unlink @n from the tree and free it; caller holds sp->lock for writing */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}

/* Initialize an sp_node covering [start, end) with policy @pol */
static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}

/* Allocate an sp_node holding a private, shared-flagged copy of @pol */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
				 pgoff_t end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				/* split the old node around [start, end) */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* free preallocations left unused by a successful retry */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* drop the lock to allocate with GFP_KERNEL, then retry the replace */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
3202 */ 3203 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3204 { 3205 int ret; 3206 3207 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3208 rwlock_init(&sp->lock); 3209 3210 if (mpol) { 3211 struct sp_node *sn; 3212 struct mempolicy *npol; 3213 NODEMASK_SCRATCH(scratch); 3214 3215 if (!scratch) 3216 goto put_mpol; 3217 3218 /* contextualize the tmpfs mount point mempolicy to this file */ 3219 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3220 if (IS_ERR(npol)) 3221 goto free_scratch; /* no valid nodemask intersection */ 3222 3223 task_lock(current); 3224 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3225 task_unlock(current); 3226 if (ret) 3227 goto put_npol; 3228 3229 /* alloc node covering entire file; adds ref to file's npol */ 3230 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3231 if (sn) 3232 sp_insert(sp, sn); 3233 put_npol: 3234 mpol_put(npol); /* drop initial ref on file's npol */ 3235 free_scratch: 3236 NODEMASK_SCRATCH_FREE(scratch); 3237 put_mpol: 3238 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3239 } 3240 } 3241 3242 int mpol_set_shared_policy(struct shared_policy *sp, 3243 struct vm_area_struct *vma, struct mempolicy *pol) 3244 { 3245 int err; 3246 struct sp_node *new = NULL; 3247 unsigned long sz = vma_pages(vma); 3248 3249 if (pol) { 3250 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3251 if (!new) 3252 return -ENOMEM; 3253 } 3254 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3255 if (err && new) 3256 sp_free(new); 3257 return err; 3258 } 3259 3260 /* Free a backing policy store on inode delete. 
 */
void mpol_free_shared_policy(struct shared_policy *sp)
{
	struct sp_node *n;
	struct rb_node *next;

	/* unlocked check: nobody else can reach sp at inode teardown */
	if (!sp->root.rb_node)
		return;
	write_lock(&sp->lock);
	next = rb_first(&sp->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		/* advance before sp_delete() frees the current node */
		next = rb_next(&n->nd);
		sp_delete(sp, n);
	}
	write_unlock(&sp->lock);
}

#ifdef CONFIG_NUMA_BALANCING
/* set by numa_balancing= on the command line: 1 enables, -1 disables */
static int __initdata numabalancing_override;

/*
 * Decide the boot-time state of automatic NUMA balancing: an explicit
 * numa_balancing= option always wins; otherwise multi-node systems get
 * the CONFIG_NUMA_BALANCING_DEFAULT_ENABLED default (and a notice).
 */
static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}

/* Early parse of numa_balancing=enable|disable; returns 1 when consumed */
static int __init setup_numabalancing(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time initialisation: create the mempolicy slab caches, seed the
 * per-node preferred policies (flagged MPOL_F_MOF | MPOL_F_MORON), and
 * install an interleave policy over the suitably sized memory nodes for
 * early allocations made by init.
 */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.nodes = nodemask_of_node(nid),
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	/* the '=' and ':' separators are NUL'ed in place below and restored
	 * on the way out, so callers can still print the original string */
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		/* reject nodes that have no memory */
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	/* per-mode validation of the (optional) nodelist */
	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			/* anything but a single plain number is rejected */
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist; mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		/* preferred takes exactly one node */
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" without a nodelist degenerates to local */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string. If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flags, "relative|balancing", and to
 * display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	/* the static default/per-node fallback policies print as "default" */
	if (pol &&
	    pol != &default_policy &&
	    !(pol >= &preferred_node_policy[0] &&
	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_WEIGHTED_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		/* unrecognized mode: warn once and emit a placeholder */
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Static and relative are mutually exclusive.
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");

		if (flags & MPOL_F_NUMA_BALANCING) {
			/* '|' only when another flag was already printed */
			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
				p += snprintf(p, buffer + maxlen - p, "|");
			p += snprintf(p, buffer + maxlen - p, "balancing");
		}
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/* one sysfs attribute per node under .../mempolicy/weighted_interleave */
struct iw_node_attr {
	struct kobj_attribute kobj_attr;
	int nid;
};

struct sysfs_wi_group {
	struct kobject wi_kobj;
	struct mutex kobj_lock;		/* protects nattrs[] add/remove */
	struct iw_node_attr *nattrs[];	/* indexed by nid, nr_node_ids long */
};

static struct sysfs_wi_group *wi_group;

/* show the current interleave weight of this attribute's node */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	struct iw_node_attr *node_attr;
	u8 weight;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	weight = get_il_weight(node_attr->nid);
	return sysfs_emit(buf, "%d\n", weight);
}

/*
 * Store a new weight for one node.  The whole weight table is replaced
 * via RCU: build a new table (copied from the old one, or all-1s if none
 * existed), update the single entry, publish it, then free the old table
 * after a grace period.  Writing a weight manually leaves auto mode.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	struct iw_node_attr *node_attr;
	u8 weight = 0;
	int i;

	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
	/* weights must be 1..255; zero and unparsable input are rejected */
	if (count == 0 || sysfs_streq(buf, "") ||
	    kstrtou8(buf, 0, &weight) || weight == 0)
		return -EINVAL;

	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state)
		return -ENOMEM;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
				lockdep_is_held(&wi_state_lock));
	if (old_wi_state) {
		memcpy(new_wi_state->iw_table,
old_wi_state->iw_table, 3650 nr_node_ids * sizeof(u8)); 3651 } else { 3652 for (i = 0; i < nr_node_ids; i++) 3653 new_wi_state->iw_table[i] = 1; 3654 } 3655 new_wi_state->iw_table[node_attr->nid] = weight; 3656 new_wi_state->mode_auto = false; 3657 3658 rcu_assign_pointer(wi_state, new_wi_state); 3659 mutex_unlock(&wi_state_lock); 3660 if (old_wi_state) { 3661 synchronize_rcu(); 3662 kfree(old_wi_state); 3663 } 3664 return count; 3665 } 3666 3667 static ssize_t weighted_interleave_auto_show(struct kobject *kobj, 3668 struct kobj_attribute *attr, char *buf) 3669 { 3670 struct weighted_interleave_state *state; 3671 bool wi_auto = true; 3672 3673 rcu_read_lock(); 3674 state = rcu_dereference(wi_state); 3675 if (state) 3676 wi_auto = state->mode_auto; 3677 rcu_read_unlock(); 3678 3679 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); 3680 } 3681 3682 static ssize_t weighted_interleave_auto_store(struct kobject *kobj, 3683 struct kobj_attribute *attr, const char *buf, size_t count) 3684 { 3685 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3686 unsigned int *bw; 3687 bool input; 3688 int i; 3689 3690 if (kstrtobool(buf, &input)) 3691 return -EINVAL; 3692 3693 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 3694 GFP_KERNEL); 3695 if (!new_wi_state) 3696 return -ENOMEM; 3697 for (i = 0; i < nr_node_ids; i++) 3698 new_wi_state->iw_table[i] = 1; 3699 3700 mutex_lock(&wi_state_lock); 3701 if (!input) { 3702 old_wi_state = rcu_dereference_protected(wi_state, 3703 lockdep_is_held(&wi_state_lock)); 3704 if (!old_wi_state) 3705 goto update_wi_state; 3706 if (input == old_wi_state->mode_auto) { 3707 mutex_unlock(&wi_state_lock); 3708 return count; 3709 } 3710 3711 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3712 nr_node_ids * sizeof(u8)); 3713 goto update_wi_state; 3714 } 3715 3716 bw = node_bw_table; 3717 if (!bw) { 3718 mutex_unlock(&wi_state_lock); 3719 kfree(new_wi_state); 3720 return -ENODEV; 3721 } 3722 3723 
new_wi_state->mode_auto = true; 3724 reduce_interleave_weights(bw, new_wi_state->iw_table); 3725 3726 update_wi_state: 3727 rcu_assign_pointer(wi_state, new_wi_state); 3728 mutex_unlock(&wi_state_lock); 3729 if (old_wi_state) { 3730 synchronize_rcu(); 3731 kfree(old_wi_state); 3732 } 3733 return count; 3734 } 3735 3736 static void sysfs_wi_node_delete(int nid) 3737 { 3738 struct iw_node_attr *attr; 3739 3740 if (nid < 0 || nid >= nr_node_ids) 3741 return; 3742 3743 mutex_lock(&wi_group->kobj_lock); 3744 attr = wi_group->nattrs[nid]; 3745 if (!attr) { 3746 mutex_unlock(&wi_group->kobj_lock); 3747 return; 3748 } 3749 3750 wi_group->nattrs[nid] = NULL; 3751 mutex_unlock(&wi_group->kobj_lock); 3752 3753 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); 3754 kfree(attr->kobj_attr.attr.name); 3755 kfree(attr); 3756 } 3757 3758 static void sysfs_wi_node_delete_all(void) 3759 { 3760 int nid; 3761 3762 for (nid = 0; nid < nr_node_ids; nid++) 3763 sysfs_wi_node_delete(nid); 3764 } 3765 3766 static void wi_state_free(void) 3767 { 3768 struct weighted_interleave_state *old_wi_state; 3769 3770 mutex_lock(&wi_state_lock); 3771 old_wi_state = rcu_dereference_protected(wi_state, 3772 lockdep_is_held(&wi_state_lock)); 3773 rcu_assign_pointer(wi_state, NULL); 3774 mutex_unlock(&wi_state_lock); 3775 3776 if (old_wi_state) { 3777 synchronize_rcu(); 3778 kfree(old_wi_state); 3779 } 3780 } 3781 3782 static struct kobj_attribute wi_auto_attr = 3783 __ATTR(auto, 0664, weighted_interleave_auto_show, 3784 weighted_interleave_auto_store); 3785 3786 static void wi_cleanup(void) { 3787 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3788 sysfs_wi_node_delete_all(); 3789 wi_state_free(); 3790 } 3791 3792 static void wi_kobj_release(struct kobject *wi_kobj) 3793 { 3794 kfree(wi_group); 3795 } 3796 3797 static const struct kobj_type wi_ktype = { 3798 .sysfs_ops = &kobj_sysfs_ops, 3799 .release = wi_kobj_release, 3800 }; 3801 3802 static int sysfs_wi_node_add(int nid) 3803 { 
	int ret;
	char *name;
	struct iw_node_attr *new_attr;

	if (nid < 0 || nid >= nr_node_ids) {
		pr_err("invalid node id: %d\n", nid);
		return -EINVAL;
	}

	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
	if (!new_attr)
		return -ENOMEM;

	name = kasprintf(GFP_KERNEL, "node%d", nid);
	if (!name) {
		kfree(new_attr);
		return -ENOMEM;
	}

	sysfs_attr_init(&new_attr->kobj_attr.attr);
	new_attr->kobj_attr.attr.name = name;
	new_attr->kobj_attr.attr.mode = 0644;
	new_attr->kobj_attr.show = node_show;
	new_attr->kobj_attr.store = node_store;
	new_attr->nid = nid;

	/* slot check, file creation and slot publish all under the lock so
	 * a concurrent add/delete for the same nid cannot interleave */
	mutex_lock(&wi_group->kobj_lock);
	if (wi_group->nattrs[nid]) {
		mutex_unlock(&wi_group->kobj_lock);
		ret = -EEXIST;
		goto out;
	}

	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
	if (ret) {
		mutex_unlock(&wi_group->kobj_lock);
		goto out;
	}
	wi_group->nattrs[nid] = new_attr;
	mutex_unlock(&wi_group->kobj_lock);
	return 0;

out:
	kfree(new_attr->kobj_attr.attr.name);
	kfree(new_attr);
	return ret;
}

/*
 * Memory hotplug callback: create the per-node weight file when a node
 * first gains memory, remove it when the node loses its last memory.
 * Errors are logged but do not veto the hotplug operation.
 */
static int wi_node_notifier(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	int err;
	struct node_notify *nn = data;
	int nid = nn->nid;

	switch (action) {
	case NODE_ADDED_FIRST_MEMORY:
		err = sysfs_wi_node_add(nid);
		if (err)
			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
			       nid, err);
		break;
	case NODE_REMOVED_LAST_MEMORY:
		sysfs_wi_node_delete(nid);
		break;
	}

	return NOTIFY_OK;
}

/*
 * Create the "weighted_interleave" kobject under @mempolicy_kobj with its
 * "auto" file and one weight file per memory node, then register for node
 * hotplug.  On failure everything created so far is unwound; the final
 * kobject_put() triggers wi_kobj_release() which frees wi_group.
 */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
			   GFP_KERNEL);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	kobject_put(&wi_group->wi_kobj);
	return err;
}

/*
 * Late init: create the "mempolicy" kobject under mm_kobj and populate
 * the weighted-interleave group beneath it.
 */
static int __init mempolicy_sysfs_init(void)
{
	int err;
	static struct kobject *mempolicy_kobj;

	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
	if (!mempolicy_kobj)
		return -ENOMEM;

	err = add_weighted_interleave_group(mempolicy_kobj);
	if (err)
		goto err_kobj;

	return 0;

err_kobj:
	kobject_del(mempolicy_kobj);
	kobject_put(mempolicy_kobj);
	return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */