// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
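
   Illustration (not part of this file; a minimal userspace sketch assuming
   the libnuma <numaif.h> wrappers are available): the policies described
   above are normally selected via set_mempolicy(2) or mbind(2), e.g.

	unsigned long nodes = (1UL << 0) | (1UL << 1);
	// interleave this task's future allocations over nodes 0 and 1
	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
		perror("set_mempolicy");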
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/leafops.h>
#include <linux/gcd.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 has been found to be
 * a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;
	u8 iw_table[];
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);

static u8 get_il_weight(int node)
{
	struct weighted_interleave_state *state;
	u8 weight = 1;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state)
		weight = state->iw_table[node];
	rcu_read_unlock();
	return weight;
}

/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock held.
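 *
 * Worked example (illustrative numbers, not measured data): with weightiness
 * of 32 and reported bandwidths bw[0] = 300 and bw[1] = 100 (sum 400), the
 * raw weights are 32 * 300 / 400 = 24 and 32 * 100 / 400 = 8; gcd(24, 8) = 8,
 * so the interleave ratio stored in iw_table reduces to 3:1.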
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}

int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);

	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
				lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);

	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
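 *
 * For example (illustrative): on a system where node 2 is a CPU-less or
 * memoryless device node, numa_nearest_node(2, N_MEMORY) returns the node
 * with memory that has the smallest node_distance() from node 2, while
 * numa_nearest_node(0, N_MEMORY) simply returns 0 if node 0 has memory.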
284 * 285 * Return: this @node if it is in state, otherwise the closest node by distance 286 */ 287 int numa_nearest_node(int node, unsigned int state) 288 { 289 int min_dist = INT_MAX, dist, n, min_node; 290 291 if (state >= NR_NODE_STATES) 292 return -EINVAL; 293 294 if (node == NUMA_NO_NODE || node_state(node, state)) 295 return node; 296 297 min_node = node; 298 for_each_node_state(n, state) { 299 dist = node_distance(node, n); 300 if (dist < min_dist) { 301 min_dist = dist; 302 min_node = n; 303 } 304 } 305 306 return min_node; 307 } 308 EXPORT_SYMBOL_GPL(numa_nearest_node); 309 310 /** 311 * nearest_node_nodemask - Find the node in @mask at the nearest distance 312 * from @node. 313 * 314 * @node: a valid node ID to start the search from. 315 * @mask: a pointer to a nodemask representing the allowed nodes. 316 * 317 * This function iterates over all nodes in @mask and calculates the 318 * distance from the starting @node, then it returns the node ID that is 319 * the closest to @node, or MAX_NUMNODES if no node is found. 320 * 321 * Note that @node must be a valid node ID usable with node_distance(), 322 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 323 * or unexpected behavior. 324 */ 325 int nearest_node_nodemask(int node, nodemask_t *mask) 326 { 327 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 328 329 for_each_node_mask(n, *mask) { 330 dist = node_distance(node, n); 331 if (dist < min_dist) { 332 min_dist = dist; 333 min_node = n; 334 } 335 } 336 337 return min_node; 338 } 339 EXPORT_SYMBOL_GPL(nearest_node_nodemask); 340 341 struct mempolicy *get_task_policy(struct task_struct *p) 342 { 343 struct mempolicy *pol = p->mempolicy; 344 int node; 345 346 if (pol) 347 return pol; 348 349 node = numa_node_id(); 350 if (node != NUMA_NO_NODE) { 351 pol = &preferred_node_policy[node]; 352 /* preferred_node_policy is not initialised early in boot */ 353 if (pol->mode) 354 return pol; 355 } 356 357 return &default_policy; 358 } 359 EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); 360 361 static const struct mempolicy_operations { 362 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 363 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 364 } mpol_ops[MPOL_MAX]; 365 366 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 367 { 368 return pol->flags & MPOL_USER_NODEMASK_FLAGS; 369 } 370 371 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 372 const nodemask_t *rel) 373 { 374 nodemask_t tmp; 375 nodes_fold(tmp, *orig, nodes_weight(*rel)); 376 nodes_onto(*ret, tmp, *rel); 377 } 378 379 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 380 { 381 if (nodes_empty(*nodes)) 382 return -EINVAL; 383 pol->nodes = *nodes; 384 return 0; 385 } 386 387 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 388 { 389 if (nodes_empty(*nodes)) 390 return -EINVAL; 391 392 nodes_clear(pol->nodes); 393 node_set(first_node(*nodes), pol->nodes); 394 return 0; 395 } 396 397 /* 398 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 399 * any, for the new policy. mpol_new() has already validated the nodes 400 * parameter with respect to the policy mode and flags. 401 * 402 * Must be called holding task's alloc_lock to protect task's mems_allowed 403 * and mempolicy. May also be called holding the mmap_lock for write. 
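 *
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES, a user nodemask
 * of {0,1} is folded modulo the number of allowed nodes and then mapped onto
 * the allowed set, so if cpuset_current_mems_allowed is {4,5,6} the effective
 * nodemask becomes {4,5}; without the flag, {0,1} would simply be ANDed with
 * the allowed set and end up empty.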
404 */ 405 static int mpol_set_nodemask(struct mempolicy *pol, 406 const nodemask_t *nodes, struct nodemask_scratch *nsc) 407 { 408 int ret; 409 410 /* 411 * Default (pol==NULL) resp. local memory policies are not a 412 * subject of any remapping. They also do not need any special 413 * constructor. 414 */ 415 if (!pol || pol->mode == MPOL_LOCAL) 416 return 0; 417 418 /* Check N_MEMORY */ 419 nodes_and(nsc->mask1, 420 cpuset_current_mems_allowed, node_states[N_MEMORY]); 421 422 VM_BUG_ON(!nodes); 423 424 if (pol->flags & MPOL_F_RELATIVE_NODES) 425 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 426 else 427 nodes_and(nsc->mask2, *nodes, nsc->mask1); 428 429 if (mpol_store_user_nodemask(pol)) 430 pol->w.user_nodemask = *nodes; 431 else 432 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 433 434 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 435 return ret; 436 } 437 438 /* 439 * This function just creates a new policy, does some check and simple 440 * initialization. You must invoke mpol_set_nodemask() to set nodes. 441 */ 442 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 443 nodemask_t *nodes) 444 { 445 struct mempolicy *policy; 446 447 if (mode == MPOL_DEFAULT) { 448 if (nodes && !nodes_empty(*nodes)) 449 return ERR_PTR(-EINVAL); 450 return NULL; 451 } 452 VM_BUG_ON(!nodes); 453 454 /* 455 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 456 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 457 * All other modes require a valid pointer to a non-empty nodemask. 458 */ 459 if (mode == MPOL_PREFERRED) { 460 if (nodes_empty(*nodes)) { 461 if (((flags & MPOL_F_STATIC_NODES) || 462 (flags & MPOL_F_RELATIVE_NODES))) 463 return ERR_PTR(-EINVAL); 464 465 mode = MPOL_LOCAL; 466 } 467 } else if (mode == MPOL_LOCAL) { 468 if (!nodes_empty(*nodes) || 469 (flags & MPOL_F_STATIC_NODES) || 470 (flags & MPOL_F_RELATIVE_NODES)) 471 return ERR_PTR(-EINVAL); 472 } else if (nodes_empty(*nodes)) 473 return ERR_PTR(-EINVAL); 474 475 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 476 if (!policy) 477 return ERR_PTR(-ENOMEM); 478 atomic_set(&policy->refcnt, 1); 479 policy->mode = mode; 480 policy->flags = flags; 481 policy->home_node = NUMA_NO_NODE; 482 483 return policy; 484 } 485 486 /* Slow path of a mpol destructor. */ 487 void __mpol_put(struct mempolicy *pol) 488 { 489 if (!atomic_dec_and_test(&pol->refcnt)) 490 return; 491 kmem_cache_free(policy_cache, pol); 492 } 493 EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); 494 495 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 496 { 497 } 498 499 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 500 { 501 nodemask_t tmp; 502 503 if (pol->flags & MPOL_F_STATIC_NODES) 504 nodes_and(tmp, pol->w.user_nodemask, *nodes); 505 else if (pol->flags & MPOL_F_RELATIVE_NODES) 506 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 507 else { 508 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 509 *nodes); 510 pol->w.cpuset_mems_allowed = *nodes; 511 } 512 513 if (nodes_empty(tmp)) 514 tmp = *nodes; 515 516 pol->nodes = tmp; 517 } 518 519 static void mpol_rebind_preferred(struct mempolicy *pol, 520 const nodemask_t *nodes) 521 { 522 pol->w.cpuset_mems_allowed = *nodes; 523 } 524 525 /* 526 * mpol_rebind_policy - Migrate a policy to a different set of nodes 527 * 528 * Per-vma policies are protected by mmap_lock. 
Allocations using per-task 529 * policies are protected by task->mems_allowed_seq to prevent a premature 530 * OOM/allocation failure due to parallel nodemask modification. 531 */ 532 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 533 { 534 if (!pol || pol->mode == MPOL_LOCAL) 535 return; 536 if (!mpol_store_user_nodemask(pol) && 537 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 538 return; 539 540 mpol_ops[pol->mode].rebind(pol, newmask); 541 } 542 543 /* 544 * Wrapper for mpol_rebind_policy() that just requires task 545 * pointer, and updates task mempolicy. 546 * 547 * Called with task's alloc_lock held. 548 */ 549 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 550 { 551 mpol_rebind_policy(tsk->mempolicy, new); 552 } 553 554 /* 555 * Rebind each vma in mm to new nodemask. 556 * 557 * Call holding a reference to mm. Takes mm->mmap_lock during call. 558 */ 559 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 560 { 561 struct vm_area_struct *vma; 562 VMA_ITERATOR(vmi, mm, 0); 563 564 mmap_write_lock(mm); 565 for_each_vma(vmi, vma) { 566 vma_start_write(vma); 567 mpol_rebind_policy(vma->vm_policy, new); 568 } 569 mmap_write_unlock(mm); 570 } 571 572 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 573 [MPOL_DEFAULT] = { 574 .rebind = mpol_rebind_default, 575 }, 576 [MPOL_INTERLEAVE] = { 577 .create = mpol_new_nodemask, 578 .rebind = mpol_rebind_nodemask, 579 }, 580 [MPOL_PREFERRED] = { 581 .create = mpol_new_preferred, 582 .rebind = mpol_rebind_preferred, 583 }, 584 [MPOL_BIND] = { 585 .create = mpol_new_nodemask, 586 .rebind = mpol_rebind_nodemask, 587 }, 588 [MPOL_LOCAL] = { 589 .rebind = mpol_rebind_default, 590 }, 591 [MPOL_PREFERRED_MANY] = { 592 .create = mpol_new_nodemask, 593 .rebind = mpol_rebind_preferred, 594 }, 595 [MPOL_WEIGHTED_INTERLEAVE] = { 596 .create = mpol_new_nodemask, 597 .rebind = mpol_rebind_nodemask, 598 }, 599 }; 600 601 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 602 unsigned long flags); 603 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 604 pgoff_t ilx, int *nid); 605 606 static bool strictly_unmovable(unsigned long flags) 607 { 608 /* 609 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 610 * if any misplaced page is found. 611 */ 612 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 613 MPOL_MF_STRICT; 614 } 615 616 struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 617 struct mempolicy *pol; 618 pgoff_t ilx; 619 }; 620 621 struct queue_pages { 622 struct list_head *pagelist; 623 unsigned long flags; 624 nodemask_t *nmask; 625 unsigned long start; 626 unsigned long end; 627 struct vm_area_struct *first; 628 struct folio *large; /* note last large folio encountered */ 629 long nr_failed; /* could not be isolated at this time */ 630 }; 631 632 /* 633 * Check if the folio's nid is in qp->nmask. 634 * 635 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 636 * in the invert of qp->nmask. 
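 *
 * For example, do_mbind() passes MPOL_MF_INVERT together with the policy
 * nodemask, so with nmask = {0} a folio resident on node 1 is "required"
 * (i.e. misplaced and a candidate for migration) while a folio already on
 * node 0 is not.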
637 */ 638 static inline bool queue_folio_required(struct folio *folio, 639 struct queue_pages *qp) 640 { 641 int nid = folio_nid(folio); 642 unsigned long flags = qp->flags; 643 644 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 645 } 646 647 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 648 { 649 struct folio *folio; 650 struct queue_pages *qp = walk->private; 651 652 if (unlikely(pmd_is_migration_entry(*pmd))) { 653 qp->nr_failed++; 654 return; 655 } 656 folio = pmd_folio(*pmd); 657 if (is_huge_zero_folio(folio)) { 658 walk->action = ACTION_CONTINUE; 659 return; 660 } 661 if (!queue_folio_required(folio, qp)) 662 return; 663 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 664 !vma_migratable(walk->vma) || 665 !migrate_folio_add(folio, qp->pagelist, qp->flags)) 666 qp->nr_failed++; 667 } 668 669 /* 670 * Scan through folios, checking if they satisfy the required conditions, 671 * moving them from LRU to local pagelist for migration if they do (or not). 672 * 673 * queue_folios_pte_range() has two possible return values: 674 * 0 - continue walking to scan for more, even if an existing folio on the 675 * wrong node could not be isolated and queued for migration. 676 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 677 * and an existing folio was on a node that does not follow the policy. 678 */ 679 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 680 unsigned long end, struct mm_walk *walk) 681 { 682 struct vm_area_struct *vma = walk->vma; 683 struct folio *folio; 684 struct queue_pages *qp = walk->private; 685 unsigned long flags = qp->flags; 686 pte_t *pte, *mapped_pte; 687 pte_t ptent; 688 spinlock_t *ptl; 689 int max_nr, nr; 690 691 ptl = pmd_trans_huge_lock(pmd, vma); 692 if (ptl) { 693 queue_folios_pmd(pmd, walk); 694 spin_unlock(ptl); 695 goto out; 696 } 697 698 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 699 if (!pte) { 700 walk->action = ACTION_AGAIN; 701 return 0; 702 } 703 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { 704 max_nr = (end - addr) >> PAGE_SHIFT; 705 nr = 1; 706 ptent = ptep_get(pte); 707 if (pte_none(ptent)) 708 continue; 709 if (!pte_present(ptent)) { 710 const softleaf_t entry = softleaf_from_pte(ptent); 711 712 if (softleaf_is_migration(entry)) 713 qp->nr_failed++; 714 continue; 715 } 716 folio = vm_normal_folio(vma, addr, ptent); 717 if (!folio || folio_is_zone_device(folio)) 718 continue; 719 if (folio_test_large(folio) && max_nr != 1) 720 nr = folio_pte_batch(folio, pte, ptent, max_nr); 721 /* 722 * vm_normal_folio() filters out zero pages, but there might 723 * still be reserved folios to skip, perhaps in a VDSO. 724 */ 725 if (folio_test_reserved(folio)) 726 continue; 727 if (!queue_folio_required(folio, qp)) 728 continue; 729 if (folio_test_large(folio)) { 730 /* 731 * A large folio can only be isolated from LRU once, 732 * but may be mapped by many PTEs (and Copy-On-Write may 733 * intersperse PTEs of other, order 0, folios). This is 734 * a common case, so don't mistake it for failure (but 735 * there can be other cases of multi-mapped pages which 736 * this quick check does not help to filter out - and a 737 * search of the pagelist might grow to be prohibitive). 738 * 739 * migrate_pages(&pagelist) returns nr_failed folios, so 740 * check "large" now so that queue_pages_range() returns 741 * a comparable nr_failed folios. 
This does imply that 742 * if folio could not be isolated for some racy reason 743 * at its first PTE, later PTEs will not give it another 744 * chance of isolation; but keeps the accounting simple. 745 */ 746 if (folio == qp->large) 747 continue; 748 qp->large = folio; 749 } 750 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 751 !vma_migratable(vma) || 752 !migrate_folio_add(folio, qp->pagelist, flags)) { 753 qp->nr_failed += nr; 754 if (strictly_unmovable(flags)) 755 break; 756 } 757 } 758 pte_unmap_unlock(mapped_pte, ptl); 759 cond_resched(); 760 out: 761 if (qp->nr_failed && strictly_unmovable(flags)) 762 return -EIO; 763 return 0; 764 } 765 766 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, 767 unsigned long addr, unsigned long end, 768 struct mm_walk *walk) 769 { 770 #ifdef CONFIG_HUGETLB_PAGE 771 struct queue_pages *qp = walk->private; 772 unsigned long flags = qp->flags; 773 struct folio *folio; 774 spinlock_t *ptl; 775 pte_t ptep; 776 777 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 778 ptep = huge_ptep_get(walk->mm, addr, pte); 779 if (!pte_present(ptep)) { 780 if (!huge_pte_none(ptep)) { 781 const softleaf_t entry = softleaf_from_pte(ptep); 782 783 if (unlikely(softleaf_is_migration(entry))) 784 qp->nr_failed++; 785 } 786 787 goto unlock; 788 } 789 folio = pfn_folio(pte_pfn(ptep)); 790 if (!queue_folio_required(folio, qp)) 791 goto unlock; 792 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 793 !vma_migratable(walk->vma)) { 794 qp->nr_failed++; 795 goto unlock; 796 } 797 /* 798 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 799 * Choosing not to migrate a shared folio is not counted as a failure. 800 * 801 * See folio_maybe_mapped_shared() on possible imprecision when we 802 * cannot easily detect if a folio is shared. 803 */ 804 if ((flags & MPOL_MF_MOVE_ALL) || 805 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) 806 if (!folio_isolate_hugetlb(folio, qp->pagelist)) 807 qp->nr_failed++; 808 unlock: 809 spin_unlock(ptl); 810 if (qp->nr_failed && strictly_unmovable(flags)) 811 return -EIO; 812 #endif 813 return 0; 814 } 815 816 #ifdef CONFIG_NUMA_BALANCING 817 /** 818 * folio_can_map_prot_numa() - check whether the folio can map prot numa 819 * @folio: The folio whose mapping considered for being made NUMA hintable 820 * @vma: The VMA that the folio belongs to. 821 * @is_private_single_threaded: Is this a single-threaded private VMA or not 822 * 823 * This function checks to see if the folio actually indicates that 824 * we need to make the mapping one which causes a NUMA hinting fault, 825 * as there are cases where it's simply unnecessary, and the folio's 826 * access time is adjusted for memory tiering if prot numa needed. 827 * 828 * Return: True if the mapping of the folio needs to be changed, false otherwise. 829 */ 830 bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, 831 bool is_private_single_threaded) 832 { 833 int nid; 834 835 if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) 836 return false; 837 838 /* Also skip shared copy-on-write folios */ 839 if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) 840 return false; 841 842 /* Folios are pinned and can't be migrated */ 843 if (folio_maybe_dma_pinned(folio)) 844 return false; 845 846 /* 847 * While migration can move some dirty folios, 848 * it cannot move them all from MIGRATE_ASYNC 849 * context. 
850 */ 851 if (folio_is_file_lru(folio) && folio_test_dirty(folio)) 852 return false; 853 854 /* 855 * Don't mess with PTEs if folio is already on the node 856 * a single-threaded process is running on. 857 */ 858 nid = folio_nid(folio); 859 if (is_private_single_threaded && (nid == numa_node_id())) 860 return false; 861 862 /* 863 * Skip scanning top tier node if normal numa 864 * balancing is disabled 865 */ 866 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && 867 node_is_toptier(nid)) 868 return false; 869 870 if (folio_use_access_time(folio)) 871 folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); 872 873 return true; 874 } 875 876 /* 877 * This is used to mark a range of virtual addresses to be inaccessible. 878 * These are later cleared by a NUMA hinting fault. Depending on these 879 * faults, pages may be migrated for better NUMA placement. 880 * 881 * This is assuming that NUMA faults are handled using PROT_NONE. If 882 * an architecture makes a different choice, it will need further 883 * changes to the core. 884 */ 885 unsigned long change_prot_numa(struct vm_area_struct *vma, 886 unsigned long addr, unsigned long end) 887 { 888 struct mmu_gather tlb; 889 long nr_updated; 890 891 tlb_gather_mmu(&tlb, vma->vm_mm); 892 893 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); 894 if (nr_updated > 0) { 895 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 896 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); 897 } 898 899 tlb_finish_mmu(&tlb); 900 901 return nr_updated; 902 } 903 #endif /* CONFIG_NUMA_BALANCING */ 904 905 static int queue_pages_test_walk(unsigned long start, unsigned long end, 906 struct mm_walk *walk) 907 { 908 struct vm_area_struct *next, *vma = walk->vma; 909 struct queue_pages *qp = walk->private; 910 unsigned long flags = qp->flags; 911 912 /* range check first */ 913 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); 914 915 if (!qp->first) { 916 qp->first = vma; 917 if (!(flags & MPOL_MF_DISCONTIG_OK) && 918 (qp->start < vma->vm_start)) 919 /* hole at head side of range */ 920 return -EFAULT; 921 } 922 next = find_vma(vma->vm_mm, vma->vm_end); 923 if (!(flags & MPOL_MF_DISCONTIG_OK) && 924 ((vma->vm_end < qp->end) && 925 (!next || vma->vm_end < next->vm_start))) 926 /* hole at middle or tail of range */ 927 return -EFAULT; 928 929 /* 930 * Need check MPOL_MF_STRICT to return -EIO if possible 931 * regardless of vma_migratable 932 */ 933 if (!vma_migratable(vma) && 934 !(flags & MPOL_MF_STRICT)) 935 return 1; 936 937 /* 938 * Check page nodes, and queue pages to move, in the current vma. 939 * But if no moving, and no strict checking, the scan can be skipped. 940 */ 941 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 942 return 0; 943 return 1; 944 } 945 946 static const struct mm_walk_ops queue_pages_walk_ops = { 947 .hugetlb_entry = queue_folios_hugetlb, 948 .pmd_entry = queue_folios_pte_range, 949 .test_walk = queue_pages_test_walk, 950 .walk_lock = PGWALK_RDLOCK, 951 }; 952 953 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 954 .hugetlb_entry = queue_folios_hugetlb, 955 .pmd_entry = queue_folios_pte_range, 956 .test_walk = queue_pages_test_walk, 957 .walk_lock = PGWALK_WRLOCK, 958 }; 959 960 /* 961 * Walk through page tables and collect pages to be migrated. 962 * 963 * If pages found in a given range are not on the required set of @nodes, 964 * and migration is allowed, they are isolated and queued to @pagelist. 
965 * 966 * queue_pages_range() may return: 967 * 0 - all pages already on the right node, or successfully queued for moving 968 * (or neither strict checking nor moving requested: only range checking). 969 * >0 - this number of misplaced folios could not be queued for moving 970 * (a hugetlbfs page or a transparent huge page being counted as 1). 971 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. 972 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. 973 */ 974 static long 975 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 976 nodemask_t *nodes, unsigned long flags, 977 struct list_head *pagelist) 978 { 979 int err; 980 struct queue_pages qp = { 981 .pagelist = pagelist, 982 .flags = flags, 983 .nmask = nodes, 984 .start = start, 985 .end = end, 986 .first = NULL, 987 }; 988 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? 989 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 990 991 err = walk_page_range(mm, start, end, ops, &qp); 992 993 if (!qp.first) 994 /* whole range in hole */ 995 err = -EFAULT; 996 997 return err ? : qp.nr_failed; 998 } 999 1000 /* 1001 * Apply policy to a single VMA 1002 * This must be called with the mmap_lock held for writing. 1003 */ 1004 static int vma_replace_policy(struct vm_area_struct *vma, 1005 struct mempolicy *pol) 1006 { 1007 int err; 1008 struct mempolicy *old; 1009 struct mempolicy *new; 1010 1011 vma_assert_write_locked(vma); 1012 1013 new = mpol_dup(pol); 1014 if (IS_ERR(new)) 1015 return PTR_ERR(new); 1016 1017 if (vma->vm_ops && vma->vm_ops->set_policy) { 1018 err = vma->vm_ops->set_policy(vma, new); 1019 if (err) 1020 goto err_out; 1021 } 1022 1023 old = vma->vm_policy; 1024 vma->vm_policy = new; /* protected by mmap_lock */ 1025 mpol_put(old); 1026 1027 return 0; 1028 err_out: 1029 mpol_put(new); 1030 return err; 1031 } 1032 1033 /* Split or merge the VMA (if required) and apply the new policy */ 1034 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, 1035 struct vm_area_struct **prev, unsigned long start, 1036 unsigned long end, struct mempolicy *new_pol) 1037 { 1038 unsigned long vmstart, vmend; 1039 1040 vmend = min(end, vma->vm_end); 1041 if (start > vma->vm_start) { 1042 *prev = vma; 1043 vmstart = start; 1044 } else { 1045 vmstart = vma->vm_start; 1046 } 1047 1048 if (mpol_equal(vma->vm_policy, new_pol)) { 1049 *prev = vma; 1050 return 0; 1051 } 1052 1053 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); 1054 if (IS_ERR(vma)) 1055 return PTR_ERR(vma); 1056 1057 *prev = vma; 1058 return vma_replace_policy(vma, new_pol); 1059 } 1060 1061 /* Set the process memory policy */ 1062 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 1063 nodemask_t *nodes) 1064 { 1065 struct mempolicy *new, *old; 1066 NODEMASK_SCRATCH(scratch); 1067 int ret; 1068 1069 if (!scratch) 1070 return -ENOMEM; 1071 1072 new = mpol_new(mode, flags, nodes); 1073 if (IS_ERR(new)) { 1074 ret = PTR_ERR(new); 1075 goto out; 1076 } 1077 1078 task_lock(current); 1079 ret = mpol_set_nodemask(new, nodes, scratch); 1080 if (ret) { 1081 task_unlock(current); 1082 mpol_put(new); 1083 goto out; 1084 } 1085 1086 old = current->mempolicy; 1087 current->mempolicy = new; 1088 if (new && (new->mode == MPOL_INTERLEAVE || 1089 new->mode == MPOL_WEIGHTED_INTERLEAVE)) { 1090 current->il_prev = MAX_NUMNODES-1; 1091 current->il_weight = 0; 1092 } 1093 task_unlock(current); 1094 mpol_put(old); 1095 ret = 0; 1096 out: 1097 
NODEMASK_SCRATCH_FREE(scratch); 1098 return ret; 1099 } 1100 1101 /* 1102 * Return nodemask for policy for get_mempolicy() query 1103 * 1104 * Called with task's alloc_lock held 1105 */ 1106 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) 1107 { 1108 nodes_clear(*nodes); 1109 if (pol == &default_policy) 1110 return; 1111 1112 switch (pol->mode) { 1113 case MPOL_BIND: 1114 case MPOL_INTERLEAVE: 1115 case MPOL_PREFERRED: 1116 case MPOL_PREFERRED_MANY: 1117 case MPOL_WEIGHTED_INTERLEAVE: 1118 *nodes = pol->nodes; 1119 break; 1120 case MPOL_LOCAL: 1121 /* return empty node mask for local allocation */ 1122 break; 1123 default: 1124 BUG(); 1125 } 1126 } 1127 1128 static int lookup_node(struct mm_struct *mm, unsigned long addr) 1129 { 1130 struct page *p = NULL; 1131 int ret; 1132 1133 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); 1134 if (ret > 0) { 1135 ret = page_to_nid(p); 1136 put_page(p); 1137 } 1138 return ret; 1139 } 1140 1141 /* Retrieve NUMA policy */ 1142 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 1143 unsigned long addr, unsigned long flags) 1144 { 1145 int err; 1146 struct mm_struct *mm = current->mm; 1147 struct vm_area_struct *vma = NULL; 1148 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; 1149 1150 if (flags & 1151 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 1152 return -EINVAL; 1153 1154 if (flags & MPOL_F_MEMS_ALLOWED) { 1155 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 1156 return -EINVAL; 1157 *policy = 0; /* just so it's initialized */ 1158 task_lock(current); 1159 *nmask = cpuset_current_mems_allowed; 1160 task_unlock(current); 1161 return 0; 1162 } 1163 1164 if (flags & MPOL_F_ADDR) { 1165 pgoff_t ilx; /* ignored here */ 1166 /* 1167 * Do NOT fall back to task policy if the 1168 * vma/shared policy at addr is NULL. We 1169 * want to return MPOL_DEFAULT in this case. 1170 */ 1171 mmap_read_lock(mm); 1172 vma = vma_lookup(mm, addr); 1173 if (!vma) { 1174 mmap_read_unlock(mm); 1175 return -EFAULT; 1176 } 1177 pol = __get_vma_policy(vma, addr, &ilx); 1178 } else if (addr) 1179 return -EINVAL; 1180 1181 if (!pol) 1182 pol = &default_policy; /* indicates default behavior */ 1183 1184 if (flags & MPOL_F_NODE) { 1185 if (flags & MPOL_F_ADDR) { 1186 /* 1187 * Take a refcount on the mpol, because we are about to 1188 * drop the mmap_lock, after which only "pol" remains 1189 * valid, "vma" is stale. 1190 */ 1191 pol_refcount = pol; 1192 vma = NULL; 1193 mpol_get(pol); 1194 mmap_read_unlock(mm); 1195 err = lookup_node(mm, addr); 1196 if (err < 0) 1197 goto out; 1198 *policy = err; 1199 } else if (pol == current->mempolicy && 1200 pol->mode == MPOL_INTERLEAVE) { 1201 *policy = next_node_in(current->il_prev, pol->nodes); 1202 } else if (pol == current->mempolicy && 1203 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1204 if (current->il_weight) 1205 *policy = current->il_prev; 1206 else 1207 *policy = next_node_in(current->il_prev, 1208 pol->nodes); 1209 } else { 1210 err = -EINVAL; 1211 goto out; 1212 } 1213 } else { 1214 *policy = pol == &default_policy ? MPOL_DEFAULT : 1215 pol->mode; 1216 /* 1217 * Internal mempolicy flags must be masked off before exposing 1218 * the policy to userspace. 
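		 * For example, MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES
		 * are part of MPOL_MODE_FLAGS and are reported back, while
		 * internal flags such as MPOL_F_SHARED or MPOL_F_MOF are not.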
1219 */ 1220 *policy |= (pol->flags & MPOL_MODE_FLAGS); 1221 } 1222 1223 err = 0; 1224 if (nmask) { 1225 if (mpol_store_user_nodemask(pol)) { 1226 *nmask = pol->w.user_nodemask; 1227 } else { 1228 task_lock(current); 1229 get_policy_nodemask(pol, nmask); 1230 task_unlock(current); 1231 } 1232 } 1233 1234 out: 1235 mpol_cond_put(pol); 1236 if (vma) 1237 mmap_read_unlock(mm); 1238 if (pol_refcount) 1239 mpol_put(pol_refcount); 1240 return err; 1241 } 1242 1243 #ifdef CONFIG_MIGRATION 1244 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1245 unsigned long flags) 1246 { 1247 /* 1248 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 1249 * Choosing not to migrate a shared folio is not counted as a failure. 1250 * 1251 * See folio_maybe_mapped_shared() on possible imprecision when we 1252 * cannot easily detect if a folio is shared. 1253 */ 1254 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { 1255 if (folio_isolate_lru(folio)) { 1256 list_add_tail(&folio->lru, foliolist); 1257 node_stat_mod_folio(folio, 1258 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1259 folio_nr_pages(folio)); 1260 } else { 1261 /* 1262 * Non-movable folio may reach here. And, there may be 1263 * temporary off LRU folios or non-LRU movable folios. 1264 * Treat them as unmovable folios since they can't be 1265 * isolated, so they can't be moved at the moment. 1266 */ 1267 return false; 1268 } 1269 } 1270 return true; 1271 } 1272 1273 /* 1274 * Migrate pages from one node to a target node. 1275 * Returns error or the number of pages not migrated. 1276 */ 1277 static long migrate_to_node(struct mm_struct *mm, int source, int dest, 1278 int flags) 1279 { 1280 nodemask_t nmask; 1281 struct vm_area_struct *vma; 1282 LIST_HEAD(pagelist); 1283 long nr_failed; 1284 long err = 0; 1285 struct migration_target_control mtc = { 1286 .nid = dest, 1287 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 1288 .reason = MR_SYSCALL, 1289 }; 1290 1291 nodes_clear(nmask); 1292 node_set(source, nmask); 1293 1294 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1295 1296 mmap_read_lock(mm); 1297 vma = find_vma(mm, 0); 1298 if (unlikely(!vma)) { 1299 mmap_read_unlock(mm); 1300 return 0; 1301 } 1302 1303 /* 1304 * This does not migrate the range, but isolates all pages that 1305 * need migration. Between passing in the full user address 1306 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, 1307 * but passes back the count of pages which could not be isolated. 1308 */ 1309 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, 1310 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1311 mmap_read_unlock(mm); 1312 1313 if (!list_empty(&pagelist)) { 1314 err = migrate_pages(&pagelist, alloc_migration_target, NULL, 1315 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); 1316 if (err) 1317 putback_movable_pages(&pagelist); 1318 } 1319 1320 if (err >= 0) 1321 err += nr_failed; 1322 return err; 1323 } 1324 1325 /* 1326 * Move pages between the two nodesets so as to preserve the physical 1327 * layout as much as possible. 1328 * 1329 * Returns the number of page that could not be moved. 1330 */ 1331 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1332 const nodemask_t *to, int flags) 1333 { 1334 long nr_failed = 0; 1335 long err = 0; 1336 nodemask_t tmp; 1337 1338 lru_cache_disable(); 1339 1340 /* 1341 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1342 * bit in 'to' is not also set in 'tmp'. 
Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
			    (node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			nr_failed += err;
		if (err < 0)
			break;
	}

	lru_cache_enable();
	if (err < 0)
		return err;
	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
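 * The interleave index passed in through struct migration_mpol is a base
 * value: adding src->index >> order below keeps a folio's position-dependent
 * node choice stable for interleaved policies even across migration.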
1428 */ 1429 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1430 unsigned long private) 1431 { 1432 struct migration_mpol *mmpol = (struct migration_mpol *)private; 1433 struct mempolicy *pol = mmpol->pol; 1434 pgoff_t ilx = mmpol->ilx; 1435 unsigned int order; 1436 int nid = numa_node_id(); 1437 gfp_t gfp; 1438 1439 order = folio_order(src); 1440 ilx += src->index >> order; 1441 1442 if (folio_test_hugetlb(src)) { 1443 nodemask_t *nodemask; 1444 struct hstate *h; 1445 1446 h = folio_hstate(src); 1447 gfp = htlb_alloc_mask(h); 1448 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 1449 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, 1450 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); 1451 } 1452 1453 if (folio_test_large(src)) 1454 gfp = GFP_TRANSHUGE; 1455 else 1456 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; 1457 1458 return folio_alloc_mpol(gfp, order, pol, ilx, nid); 1459 } 1460 #else 1461 1462 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1463 unsigned long flags) 1464 { 1465 return false; 1466 } 1467 1468 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1469 const nodemask_t *to, int flags) 1470 { 1471 return -ENOSYS; 1472 } 1473 1474 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1475 unsigned long private) 1476 { 1477 return NULL; 1478 } 1479 #endif 1480 1481 static long do_mbind(unsigned long start, unsigned long len, 1482 unsigned short mode, unsigned short mode_flags, 1483 nodemask_t *nmask, unsigned long flags) 1484 { 1485 struct mm_struct *mm = current->mm; 1486 struct vm_area_struct *vma, *prev; 1487 struct vma_iterator vmi; 1488 struct migration_mpol mmpol; 1489 struct mempolicy *new; 1490 unsigned long end; 1491 long err; 1492 long nr_failed; 1493 LIST_HEAD(pagelist); 1494 1495 if (flags & ~(unsigned long)MPOL_MF_VALID) 1496 return -EINVAL; 1497 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1498 return -EPERM; 1499 1500 if (start & ~PAGE_MASK) 1501 return -EINVAL; 1502 1503 if (mode == MPOL_DEFAULT) 1504 flags &= ~MPOL_MF_STRICT; 1505 1506 len = PAGE_ALIGN(len); 1507 end = start + len; 1508 1509 if (end < start) 1510 return -EINVAL; 1511 if (end == start) 1512 return 0; 1513 1514 new = mpol_new(mode, mode_flags, nmask); 1515 if (IS_ERR(new)) 1516 return PTR_ERR(new); 1517 1518 /* 1519 * If we are using the default policy then operation 1520 * on discontinuous address spaces is okay after all 1521 */ 1522 if (!new) 1523 flags |= MPOL_MF_DISCONTIG_OK; 1524 1525 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1526 lru_cache_disable(); 1527 { 1528 NODEMASK_SCRATCH(scratch); 1529 if (scratch) { 1530 mmap_write_lock(mm); 1531 err = mpol_set_nodemask(new, nmask, scratch); 1532 if (err) 1533 mmap_write_unlock(mm); 1534 } else 1535 err = -ENOMEM; 1536 NODEMASK_SCRATCH_FREE(scratch); 1537 } 1538 if (err) 1539 goto mpol_out; 1540 1541 /* 1542 * Lock the VMAs before scanning for pages to migrate, 1543 * to ensure we don't miss a concurrently inserted page. 
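	 *
	 * (For reference, a typical userspace call reaching this point might
	 * be, illustratively: mbind(addr, len, MPOL_BIND | MPOL_F_STATIC_NODES,
	 * &mask, maxnode, MPOL_MF_MOVE | MPOL_MF_STRICT); the MPOL_MF_MOVE
	 * flags are what cause folios to be queued here for migration.)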
1544 */ 1545 nr_failed = queue_pages_range(mm, start, end, nmask, 1546 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); 1547 1548 if (nr_failed < 0) { 1549 err = nr_failed; 1550 nr_failed = 0; 1551 } else { 1552 vma_iter_init(&vmi, mm, start); 1553 prev = vma_prev(&vmi); 1554 for_each_vma_range(vmi, vma, end) { 1555 err = mbind_range(&vmi, vma, &prev, start, end, new); 1556 if (err) 1557 break; 1558 } 1559 } 1560 1561 if (!err && !list_empty(&pagelist)) { 1562 /* Convert MPOL_DEFAULT's NULL to task or default policy */ 1563 if (!new) { 1564 new = get_task_policy(current); 1565 mpol_get(new); 1566 } 1567 mmpol.pol = new; 1568 mmpol.ilx = 0; 1569 1570 /* 1571 * In the interleaved case, attempt to allocate on exactly the 1572 * targeted nodes, for the first VMA to be migrated; for later 1573 * VMAs, the nodes will still be interleaved from the targeted 1574 * nodemask, but one by one may be selected differently. 1575 */ 1576 if (new->mode == MPOL_INTERLEAVE || 1577 new->mode == MPOL_WEIGHTED_INTERLEAVE) { 1578 struct folio *folio; 1579 unsigned int order; 1580 unsigned long addr = -EFAULT; 1581 1582 list_for_each_entry(folio, &pagelist, lru) { 1583 if (!folio_test_ksm(folio)) 1584 break; 1585 } 1586 if (!list_entry_is_head(folio, &pagelist, lru)) { 1587 vma_iter_init(&vmi, mm, start); 1588 for_each_vma_range(vmi, vma, end) { 1589 addr = page_address_in_vma(folio, 1590 folio_page(folio, 0), vma); 1591 if (addr != -EFAULT) 1592 break; 1593 } 1594 } 1595 if (addr != -EFAULT) { 1596 order = folio_order(folio); 1597 /* We already know the pol, but not the ilx */ 1598 mpol_cond_put(get_vma_policy(vma, addr, order, 1599 &mmpol.ilx)); 1600 /* Set base from which to increment by index */ 1601 mmpol.ilx -= folio->index >> order; 1602 } 1603 } 1604 } 1605 1606 mmap_write_unlock(mm); 1607 1608 if (!err && !list_empty(&pagelist)) { 1609 nr_failed |= migrate_pages(&pagelist, 1610 alloc_migration_target_by_mpol, NULL, 1611 (unsigned long)&mmpol, MIGRATE_SYNC, 1612 MR_MEMPOLICY_MBIND, NULL); 1613 } 1614 1615 if (nr_failed && (flags & MPOL_MF_STRICT)) 1616 err = -EIO; 1617 if (!list_empty(&pagelist)) 1618 putback_movable_pages(&pagelist); 1619 mpol_out: 1620 mpol_put(new); 1621 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1622 lru_cache_enable(); 1623 return err; 1624 } 1625 1626 /* 1627 * User space interface with variable sized bitmaps for nodelists. 1628 */ 1629 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, 1630 unsigned long maxnode) 1631 { 1632 unsigned long nlongs = BITS_TO_LONGS(maxnode); 1633 int ret; 1634 1635 if (in_compat_syscall()) 1636 ret = compat_get_bitmap(mask, 1637 (const compat_ulong_t __user *)nmask, 1638 maxnode); 1639 else 1640 ret = copy_from_user(mask, nmask, 1641 nlongs * sizeof(unsigned long)); 1642 1643 if (ret) 1644 return -EFAULT; 1645 1646 if (maxnode % BITS_PER_LONG) 1647 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; 1648 1649 return 0; 1650 } 1651 1652 /* Copy a node mask from user space. */ 1653 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1654 unsigned long maxnode) 1655 { 1656 --maxnode; 1657 nodes_clear(*nodes); 1658 if (maxnode == 0 || !nmask) 1659 return 0; 1660 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1661 return -EINVAL; 1662 1663 /* 1664 * When the user specified more nodes than supported just check 1665 * if the non supported part is all zero, one word at a time, 1666 * starting at the end. 
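	 *
	 * For example (illustrative), if MAX_NUMNODES were 64 and userspace
	 * passed maxnode = 129 (a 128-bit bitmap, minus the historical
	 * off-by-one handled by --maxnode above), the upper 64 bits are only
	 * checked for being zero: any bit set at or above MAX_NUMNODES makes
	 * the call fail with -EINVAL.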
1667 */ 1668 while (maxnode > MAX_NUMNODES) { 1669 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); 1670 unsigned long t; 1671 1672 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) 1673 return -EFAULT; 1674 1675 if (maxnode - bits >= MAX_NUMNODES) { 1676 maxnode -= bits; 1677 } else { 1678 maxnode = MAX_NUMNODES; 1679 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); 1680 } 1681 if (t) 1682 return -EINVAL; 1683 } 1684 1685 return get_bitmap(nodes_addr(*nodes), nmask, maxnode); 1686 } 1687 1688 /* Copy a kernel node mask to user space */ 1689 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1690 nodemask_t *nodes) 1691 { 1692 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1693 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); 1694 bool compat = in_compat_syscall(); 1695 1696 if (compat) 1697 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); 1698 1699 if (copy > nbytes) { 1700 if (copy > PAGE_SIZE) 1701 return -EINVAL; 1702 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1703 return -EFAULT; 1704 copy = nbytes; 1705 maxnode = nr_node_ids; 1706 } 1707 1708 if (compat) 1709 return compat_put_bitmap((compat_ulong_t __user *)mask, 1710 nodes_addr(*nodes), maxnode); 1711 1712 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1713 } 1714 1715 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ 1716 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) 1717 { 1718 *flags = *mode & MPOL_MODE_FLAGS; 1719 *mode &= ~MPOL_MODE_FLAGS; 1720 1721 if ((unsigned int)(*mode) >= MPOL_MAX) 1722 return -EINVAL; 1723 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) 1724 return -EINVAL; 1725 if (*flags & MPOL_F_NUMA_BALANCING) { 1726 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) 1727 *flags |= (MPOL_F_MOF | MPOL_F_MORON); 1728 else 1729 return -EINVAL; 1730 } 1731 return 0; 1732 } 1733 1734 static long kernel_mbind(unsigned long start, unsigned long len, 1735 unsigned long mode, const unsigned long __user *nmask, 1736 unsigned long maxnode, unsigned int flags) 1737 { 1738 unsigned short mode_flags; 1739 nodemask_t nodes; 1740 int lmode = mode; 1741 int err; 1742 1743 start = untagged_addr(start); 1744 err = sanitize_mpol_flags(&lmode, &mode_flags); 1745 if (err) 1746 return err; 1747 1748 err = get_nodes(&nodes, nmask, maxnode); 1749 if (err) 1750 return err; 1751 1752 return do_mbind(start, len, lmode, mode_flags, &nodes, flags); 1753 } 1754 1755 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, 1756 unsigned long, home_node, unsigned long, flags) 1757 { 1758 struct mm_struct *mm = current->mm; 1759 struct vm_area_struct *vma, *prev; 1760 struct mempolicy *new, *old; 1761 unsigned long end; 1762 int err = -ENOENT; 1763 VMA_ITERATOR(vmi, mm, start); 1764 1765 start = untagged_addr(start); 1766 if (start & ~PAGE_MASK) 1767 return -EINVAL; 1768 /* 1769 * flags is used for future extension if any. 1770 */ 1771 if (flags != 0) 1772 return -EINVAL; 1773 1774 /* 1775 * Check home_node is online to avoid accessing uninitialized 1776 * NODE_DATA. 
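	 *
	 * (Usage sketch, illustrative only: userspace first applies an
	 * MPOL_BIND or MPOL_PREFERRED_MANY policy over several nodes with
	 * mbind(2), then calls
	 * syscall(SYS_set_mempolicy_home_node, start, len, node, 0) to pick
	 * which of those nodes allocations should try first.)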
1777 */ 1778 if (home_node >= MAX_NUMNODES || !node_online(home_node)) 1779 return -EINVAL; 1780 1781 len = PAGE_ALIGN(len); 1782 end = start + len; 1783 1784 if (end < start) 1785 return -EINVAL; 1786 if (end == start) 1787 return 0; 1788 mmap_write_lock(mm); 1789 prev = vma_prev(&vmi); 1790 for_each_vma_range(vmi, vma, end) { 1791 /* 1792 * If any vma in the range got policy other than MPOL_BIND 1793 * or MPOL_PREFERRED_MANY we return error. We don't reset 1794 * the home node for vmas we already updated before. 1795 */ 1796 old = vma_policy(vma); 1797 if (!old) { 1798 prev = vma; 1799 continue; 1800 } 1801 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { 1802 err = -EOPNOTSUPP; 1803 break; 1804 } 1805 new = mpol_dup(old); 1806 if (IS_ERR(new)) { 1807 err = PTR_ERR(new); 1808 break; 1809 } 1810 1811 vma_start_write(vma); 1812 new->home_node = home_node; 1813 err = mbind_range(&vmi, vma, &prev, start, end, new); 1814 mpol_put(new); 1815 if (err) 1816 break; 1817 } 1818 mmap_write_unlock(mm); 1819 return err; 1820 } 1821 1822 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1823 unsigned long, mode, const unsigned long __user *, nmask, 1824 unsigned long, maxnode, unsigned int, flags) 1825 { 1826 return kernel_mbind(start, len, mode, nmask, maxnode, flags); 1827 } 1828 1829 /* Set the process memory policy */ 1830 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, 1831 unsigned long maxnode) 1832 { 1833 unsigned short mode_flags; 1834 nodemask_t nodes; 1835 int lmode = mode; 1836 int err; 1837 1838 err = sanitize_mpol_flags(&lmode, &mode_flags); 1839 if (err) 1840 return err; 1841 1842 err = get_nodes(&nodes, nmask, maxnode); 1843 if (err) 1844 return err; 1845 1846 return do_set_mempolicy(lmode, mode_flags, &nodes); 1847 } 1848 1849 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, 1850 unsigned long, maxnode) 1851 { 1852 return kernel_set_mempolicy(mode, nmask, maxnode); 1853 } 1854 1855 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, 1856 const unsigned long __user *old_nodes, 1857 const unsigned long __user *new_nodes) 1858 { 1859 struct mm_struct *mm = NULL; 1860 struct task_struct *task; 1861 nodemask_t task_nodes; 1862 int err; 1863 nodemask_t *old; 1864 nodemask_t *new; 1865 NODEMASK_SCRATCH(scratch); 1866 1867 if (!scratch) 1868 return -ENOMEM; 1869 1870 old = &scratch->mask1; 1871 new = &scratch->mask2; 1872 1873 err = get_nodes(old, old_nodes, maxnode); 1874 if (err) 1875 goto out; 1876 1877 err = get_nodes(new, new_nodes, maxnode); 1878 if (err) 1879 goto out; 1880 1881 /* Find the mm_struct */ 1882 rcu_read_lock(); 1883 task = pid ? find_task_by_vpid(pid) : current; 1884 if (!task) { 1885 rcu_read_unlock(); 1886 err = -ESRCH; 1887 goto out; 1888 } 1889 get_task_struct(task); 1890 1891 err = -EINVAL; 1892 1893 /* 1894 * Check if this process has the right to modify the specified process. 1895 * Use the regular "ptrace_may_access()" checks. 1896 */ 1897 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1898 rcu_read_unlock(); 1899 err = -EPERM; 1900 goto out_put; 1901 } 1902 rcu_read_unlock(); 1903 1904 task_nodes = cpuset_mems_allowed(task); 1905 /* Is the user allowed to access the target nodes? 
*/ 1906 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1907 err = -EPERM; 1908 goto out_put; 1909 } 1910 1911 task_nodes = cpuset_mems_allowed(current); 1912 if (!nodes_and(*new, *new, task_nodes)) 1913 goto out_put; 1914 1915 err = security_task_movememory(task); 1916 if (err) 1917 goto out_put; 1918 1919 mm = get_task_mm(task); 1920 put_task_struct(task); 1921 1922 if (!mm) { 1923 err = -EINVAL; 1924 goto out; 1925 } 1926 1927 err = do_migrate_pages(mm, old, new, 1928 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1929 1930 mmput(mm); 1931 out: 1932 NODEMASK_SCRATCH_FREE(scratch); 1933 1934 return err; 1935 1936 out_put: 1937 put_task_struct(task); 1938 goto out; 1939 } 1940 1941 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1942 const unsigned long __user *, old_nodes, 1943 const unsigned long __user *, new_nodes) 1944 { 1945 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); 1946 } 1947 1948 /* Retrieve NUMA policy */ 1949 static int kernel_get_mempolicy(int __user *policy, 1950 unsigned long __user *nmask, 1951 unsigned long maxnode, 1952 unsigned long addr, 1953 unsigned long flags) 1954 { 1955 int err; 1956 int pval; 1957 nodemask_t nodes; 1958 1959 if (nmask != NULL && maxnode < nr_node_ids) 1960 return -EINVAL; 1961 1962 addr = untagged_addr(addr); 1963 1964 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1965 1966 if (err) 1967 return err; 1968 1969 if (policy && put_user(pval, policy)) 1970 return -EFAULT; 1971 1972 if (nmask) 1973 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1974 1975 return err; 1976 } 1977 1978 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1979 unsigned long __user *, nmask, unsigned long, maxnode, 1980 unsigned long, addr, unsigned long, flags) 1981 { 1982 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); 1983 } 1984 1985 bool vma_migratable(struct vm_area_struct *vma) 1986 { 1987 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1988 return false; 1989 1990 /* 1991 * DAX device mappings require predictable access latency, so avoid 1992 * incurring periodic faults. 1993 */ 1994 if (vma_is_dax(vma)) 1995 return false; 1996 1997 if (is_vm_hugetlb_page(vma) && 1998 !hugepage_migration_supported(hstate_vma(vma))) 1999 return false; 2000 2001 /* 2002 * Migration allocates pages in the highest zone. If we cannot 2003 * do so then migration (at least from node to node) is not 2004 * possible. 2005 */ 2006 if (vma->vm_file && 2007 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) 2008 < policy_zone) 2009 return false; 2010 return true; 2011 } 2012 2013 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 2014 unsigned long addr, pgoff_t *ilx) 2015 { 2016 *ilx = 0; 2017 return (vma->vm_ops && vma->vm_ops->get_policy) ? 2018 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; 2019 } 2020 2021 /* 2022 * get_vma_policy(@vma, @addr, @order, @ilx) 2023 * @vma: virtual memory area whose policy is sought 2024 * @addr: address in @vma for shared policy lookup 2025 * @order: 0, or appropriate huge_page_order for interleaving 2026 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or 2027 * MPOL_WEIGHTED_INTERLEAVE 2028 * 2029 * Returns effective policy for a VMA at specified address. 2030 * Falls back to current->mempolicy or system default policy, as necessary. 2031 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 2032 * count--added by the get_policy() vm_op, as appropriate--to protect against 2033 * freeing by another task. 
It is the caller's responsibility to free the 2034 * extra reference for shared policies. 2035 */ 2036 struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 2037 unsigned long addr, int order, pgoff_t *ilx) 2038 { 2039 struct mempolicy *pol; 2040 2041 pol = __get_vma_policy(vma, addr, ilx); 2042 if (!pol) 2043 pol = get_task_policy(current); 2044 if (pol->mode == MPOL_INTERLEAVE || 2045 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 2046 *ilx += vma->vm_pgoff >> order; 2047 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); 2048 } 2049 return pol; 2050 } 2051 2052 bool vma_policy_mof(struct vm_area_struct *vma) 2053 { 2054 struct mempolicy *pol; 2055 2056 if (vma->vm_ops && vma->vm_ops->get_policy) { 2057 bool ret = false; 2058 pgoff_t ilx; /* ignored here */ 2059 2060 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); 2061 if (pol && (pol->flags & MPOL_F_MOF)) 2062 ret = true; 2063 mpol_cond_put(pol); 2064 2065 return ret; 2066 } 2067 2068 pol = vma->vm_policy; 2069 if (!pol) 2070 pol = get_task_policy(current); 2071 2072 return pol->flags & MPOL_F_MOF; 2073 } 2074 2075 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 2076 { 2077 enum zone_type dynamic_policy_zone = policy_zone; 2078 2079 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 2080 2081 /* 2082 * if policy->nodes has movable memory only, 2083 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 2084 * 2085 * policy->nodes is intersect with node_states[N_MEMORY]. 2086 * so if the following test fails, it implies 2087 * policy->nodes has movable memory only. 2088 */ 2089 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) 2090 dynamic_policy_zone = ZONE_MOVABLE; 2091 2092 return zone >= dynamic_policy_zone; 2093 } 2094 2095 static unsigned int weighted_interleave_nodes(struct mempolicy *policy) 2096 { 2097 unsigned int node; 2098 unsigned int cpuset_mems_cookie; 2099 2100 retry: 2101 /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ 2102 cpuset_mems_cookie = read_mems_allowed_begin(); 2103 node = current->il_prev; 2104 if (!current->il_weight || !node_isset(node, policy->nodes)) { 2105 node = next_node_in(node, policy->nodes); 2106 if (read_mems_allowed_retry(cpuset_mems_cookie)) 2107 goto retry; 2108 if (node == MAX_NUMNODES) 2109 return node; 2110 current->il_prev = node; 2111 current->il_weight = get_il_weight(node); 2112 } 2113 current->il_weight--; 2114 return node; 2115 } 2116 2117 /* Do dynamic interleaving for a process */ 2118 static unsigned int interleave_nodes(struct mempolicy *policy) 2119 { 2120 unsigned int nid; 2121 unsigned int cpuset_mems_cookie; 2122 2123 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ 2124 do { 2125 cpuset_mems_cookie = read_mems_allowed_begin(); 2126 nid = next_node_in(current->il_prev, policy->nodes); 2127 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2128 2129 if (nid < MAX_NUMNODES) 2130 current->il_prev = nid; 2131 return nid; 2132 } 2133 2134 /* 2135 * Depending on the memory policy provide a node from which to allocate the 2136 * next slab entry. 
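 * In interrupt context, or when the task has no mempolicy, this falls
 * back to the local memory node (numa_mem_id()).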
2137 */ 2138 unsigned int mempolicy_slab_node(void) 2139 { 2140 struct mempolicy *policy; 2141 int node = numa_mem_id(); 2142 2143 if (!in_task()) 2144 return node; 2145 2146 policy = current->mempolicy; 2147 if (!policy) 2148 return node; 2149 2150 switch (policy->mode) { 2151 case MPOL_PREFERRED: 2152 return first_node(policy->nodes); 2153 2154 case MPOL_INTERLEAVE: 2155 return interleave_nodes(policy); 2156 2157 case MPOL_WEIGHTED_INTERLEAVE: 2158 return weighted_interleave_nodes(policy); 2159 2160 case MPOL_BIND: 2161 case MPOL_PREFERRED_MANY: 2162 { 2163 struct zoneref *z; 2164 2165 /* 2166 * Follow bind policy behavior and start allocation at the 2167 * first node. 2168 */ 2169 struct zonelist *zonelist; 2170 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 2171 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 2172 z = first_zones_zonelist(zonelist, highest_zoneidx, 2173 &policy->nodes); 2174 return zonelist_zone(z) ? zonelist_node_idx(z) : node; 2175 } 2176 case MPOL_LOCAL: 2177 return node; 2178 2179 default: 2180 BUG(); 2181 } 2182 } 2183 2184 static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 2185 nodemask_t *mask) 2186 { 2187 /* 2188 * barrier stabilizes the nodemask locally so that it can be iterated 2189 * over safely without concern for changes. Allocators validate node 2190 * selection does not violate mems_allowed, so this is safe. 2191 */ 2192 barrier(); 2193 memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 2194 barrier(); 2195 return nodes_weight(*mask); 2196 } 2197 2198 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2199 { 2200 struct weighted_interleave_state *state; 2201 nodemask_t nodemask; 2202 unsigned int target, nr_nodes; 2203 u8 *table = NULL; 2204 unsigned int weight_total = 0; 2205 u8 weight; 2206 int nid = 0; 2207 2208 nr_nodes = read_once_policy_nodemask(pol, &nodemask); 2209 if (!nr_nodes) 2210 return numa_node_id(); 2211 2212 rcu_read_lock(); 2213 2214 state = rcu_dereference(wi_state); 2215 /* Uninitialized wi_state means we should assume all weights are 1 */ 2216 if (state) 2217 table = state->iw_table; 2218 2219 /* calculate the total weight */ 2220 for_each_node_mask(nid, nodemask) 2221 weight_total += table ? table[nid] : 1; 2222 2223 /* Calculate the node offset based on totals */ 2224 target = ilx % weight_total; 2225 nid = first_node(nodemask); 2226 while (target) { 2227 /* detect system default usage */ 2228 weight = table ? table[nid] : 1; 2229 if (target < weight) 2230 break; 2231 target -= weight; 2232 nid = next_node_in(nid, nodemask); 2233 } 2234 rcu_read_unlock(); 2235 return nid; 2236 } 2237 2238 /* 2239 * Do static interleaving for interleave index @ilx. Returns the ilx'th 2240 * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2241 * exceeds the number of present nodes. 2242 */ 2243 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2244 { 2245 nodemask_t nodemask; 2246 unsigned int target, nnodes; 2247 int i; 2248 int nid; 2249 2250 nnodes = read_once_policy_nodemask(pol, &nodemask); 2251 if (!nnodes) 2252 return numa_node_id(); 2253 target = ilx % nnodes; 2254 nid = first_node(nodemask); 2255 for (i = 0; i < target; i++) 2256 nid = next_node(nid, nodemask); 2257 return nid; 2258 } 2259 2260 /* 2261 * Return a nodemask representing a mempolicy for filtering nodes for 2262 * page allocation, together with preferred node id (or the input node id). 
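 * Only MPOL_BIND and MPOL_PREFERRED_MANY may return a non-NULL nodemask;
 * for the other modes a NULL return means no nodemask filtering is needed.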
2263 */ 2264 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2265 pgoff_t ilx, int *nid) 2266 { 2267 nodemask_t *nodemask = NULL; 2268 2269 switch (pol->mode) { 2270 case MPOL_PREFERRED: 2271 /* Override input node id */ 2272 *nid = first_node(pol->nodes); 2273 break; 2274 case MPOL_PREFERRED_MANY: 2275 nodemask = &pol->nodes; 2276 if (pol->home_node != NUMA_NO_NODE) 2277 *nid = pol->home_node; 2278 break; 2279 case MPOL_BIND: 2280 /* Restrict to nodemask (but not on lower zones) */ 2281 if (apply_policy_zone(pol, gfp_zone(gfp)) && 2282 cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2283 nodemask = &pol->nodes; 2284 if (pol->home_node != NUMA_NO_NODE) 2285 *nid = pol->home_node; 2286 /* 2287 * __GFP_THISNODE shouldn't even be used with the bind policy 2288 * because we might easily break the expectation to stay on the 2289 * requested node and not break the policy. 2290 */ 2291 WARN_ON_ONCE(gfp & __GFP_THISNODE); 2292 break; 2293 case MPOL_INTERLEAVE: 2294 /* Override input node id */ 2295 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2296 interleave_nodes(pol) : interleave_nid(pol, ilx); 2297 break; 2298 case MPOL_WEIGHTED_INTERLEAVE: 2299 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2300 weighted_interleave_nodes(pol) : 2301 weighted_interleave_nid(pol, ilx); 2302 break; 2303 } 2304 2305 return nodemask; 2306 } 2307 2308 #ifdef CONFIG_HUGETLBFS 2309 /* 2310 * huge_node(@vma, @addr, @gfp_flags, @mpol) 2311 * @vma: virtual memory area whose policy is sought 2312 * @addr: address in @vma for shared policy lookup and interleave policy 2313 * @gfp_flags: for requested zone 2314 * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2315 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2316 * 2317 * Returns a nid suitable for a huge page allocation and a pointer 2318 * to the struct mempolicy for conditional unref after allocation. 2319 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2320 * to the mempolicy's @nodemask for filtering the zonelist. 2321 */ 2322 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 2323 struct mempolicy **mpol, nodemask_t **nodemask) 2324 { 2325 pgoff_t ilx; 2326 int nid; 2327 2328 nid = numa_node_id(); 2329 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2330 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 2331 return nid; 2332 } 2333 2334 /* 2335 * init_nodemask_of_mempolicy 2336 * 2337 * If the current task's mempolicy is "default" [NULL], return 'false' 2338 * to indicate default policy. Otherwise, extract the policy nodemask 2339 * for 'bind' or 'interleave' policy into the argument nodemask, or 2340 * initialize the argument nodemask to contain the single node for 2341 * 'preferred' or 'local' policy and return 'true' to indicate presence 2342 * of non-default mempolicy. 2343 * 2344 * We don't bother with reference counting the mempolicy [mpol_get/put] 2345 * because the current task is examining it's own mempolicy and a task's 2346 * mempolicy is only ever changed by the task itself. 2347 * 2348 * N.B., it is the caller's responsibility to free a returned nodemask. 
2349 */ 2350 bool init_nodemask_of_mempolicy(nodemask_t *mask) 2351 { 2352 struct mempolicy *mempolicy; 2353 2354 if (!(mask && current->mempolicy)) 2355 return false; 2356 2357 task_lock(current); 2358 mempolicy = current->mempolicy; 2359 switch (mempolicy->mode) { 2360 case MPOL_PREFERRED: 2361 case MPOL_PREFERRED_MANY: 2362 case MPOL_BIND: 2363 case MPOL_INTERLEAVE: 2364 case MPOL_WEIGHTED_INTERLEAVE: 2365 *mask = mempolicy->nodes; 2366 break; 2367 2368 case MPOL_LOCAL: 2369 init_nodemask_of_node(mask, numa_node_id()); 2370 break; 2371 2372 default: 2373 BUG(); 2374 } 2375 task_unlock(current); 2376 2377 return true; 2378 } 2379 #endif 2380 2381 /* 2382 * mempolicy_in_oom_domain 2383 * 2384 * If tsk's mempolicy is "bind", check for intersection between mask and 2385 * the policy nodemask. Otherwise, return true for all other policies 2386 * including "interleave", as a tsk with "interleave" policy may have 2387 * memory allocated from all nodes in system. 2388 * 2389 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 2390 */ 2391 bool mempolicy_in_oom_domain(struct task_struct *tsk, 2392 const nodemask_t *mask) 2393 { 2394 struct mempolicy *mempolicy; 2395 bool ret = true; 2396 2397 if (!mask) 2398 return ret; 2399 2400 task_lock(tsk); 2401 mempolicy = tsk->mempolicy; 2402 if (mempolicy && mempolicy->mode == MPOL_BIND) 2403 ret = nodes_intersects(mempolicy->nodes, *mask); 2404 task_unlock(tsk); 2405 2406 return ret; 2407 } 2408 2409 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2410 int nid, nodemask_t *nodemask) 2411 { 2412 struct page *page; 2413 gfp_t preferred_gfp; 2414 2415 /* 2416 * This is a two pass approach. The first pass will only try the 2417 * preferred nodes but skip the direct reclaim and allow the 2418 * allocation to fail, while the second pass will try all the 2419 * nodes in system. 2420 */ 2421 preferred_gfp = gfp | __GFP_NOWARN; 2422 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2423 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); 2424 if (!page) 2425 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); 2426 2427 return page; 2428 } 2429 2430 /** 2431 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2432 * @gfp: GFP flags. 2433 * @order: Order of the page allocation. 2434 * @pol: Pointer to the NUMA mempolicy. 2435 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2436 * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2437 * 2438 * Return: The page on success or NULL if allocation fails. 2439 */ 2440 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2441 struct mempolicy *pol, pgoff_t ilx, int nid) 2442 { 2443 nodemask_t *nodemask; 2444 struct page *page; 2445 2446 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2447 2448 if (pol->mode == MPOL_PREFERRED_MANY) 2449 return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2450 2451 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2452 /* filter "hugepage" allocation, unless from alloc_pages() */ 2453 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2454 /* 2455 * For hugepage allocation and non-interleave policy which 2456 * allows the current node (or other explicitly preferred 2457 * node) we only try to allocate from the current/preferred 2458 * node and don't fall back to other nodes, as the cost of 2459 * remote accesses would likely offset THP benefits. 
2460 * 2461 * If the policy is interleave or does not allow the current 2462 * node in its nodemask, we allocate the standard way. 2463 */ 2464 if (pol->mode != MPOL_INTERLEAVE && 2465 pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2466 (!nodemask || node_isset(nid, *nodemask))) { 2467 /* 2468 * First, try to allocate THP only on local node, but 2469 * don't reclaim unnecessarily, just compact. 2470 */ 2471 page = __alloc_frozen_pages_noprof( 2472 gfp | __GFP_THISNODE | __GFP_NORETRY, order, 2473 nid, NULL); 2474 if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2475 return page; 2476 /* 2477 * If hugepage allocations are configured to always 2478 * synchronous compact or the vma has been madvised 2479 * to prefer hugepage backing, retry allowing remote 2480 * memory with both reclaim and compact as well. 2481 */ 2482 } 2483 } 2484 2485 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); 2486 2487 if (unlikely(pol->mode == MPOL_INTERLEAVE || 2488 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { 2489 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2490 if (static_branch_likely(&vm_numa_stat_key) && 2491 page_to_nid(page) == nid) { 2492 preempt_disable(); 2493 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2494 preempt_enable(); 2495 } 2496 } 2497 2498 return page; 2499 } 2500 2501 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, 2502 struct mempolicy *pol, pgoff_t ilx, int nid) 2503 { 2504 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, 2505 ilx, nid); 2506 if (!page) 2507 return NULL; 2508 2509 set_page_refcounted(page); 2510 return page_rmappable_folio(page); 2511 } 2512 2513 /** 2514 * vma_alloc_folio - Allocate a folio for a VMA. 2515 * @gfp: GFP flags. 2516 * @order: Order of the folio. 2517 * @vma: Pointer to VMA. 2518 * @addr: Virtual address of the allocation. Must be inside @vma. 2519 * 2520 * Allocate a folio for a specific address in @vma, using the appropriate 2521 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2522 * VMA to prevent it from going away. Should be used for all allocations 2523 * for folios that will be mapped into user space, excepting hugetlbfs, and 2524 * excepting where direct use of folio_alloc_mpol() is more appropriate. 2525 * 2526 * Return: The folio on success or NULL if allocation fails. 2527 */ 2528 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, 2529 unsigned long addr) 2530 { 2531 struct mempolicy *pol; 2532 pgoff_t ilx; 2533 struct folio *folio; 2534 2535 if (vma->vm_flags & VM_DROPPABLE) 2536 gfp |= __GFP_NOWARN; 2537 2538 pol = get_vma_policy(vma, addr, order, &ilx); 2539 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2540 mpol_cond_put(pol); 2541 return folio; 2542 } 2543 EXPORT_SYMBOL(vma_alloc_folio_noprof); 2544 2545 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) 2546 { 2547 struct mempolicy *pol = &default_policy; 2548 2549 /* 2550 * No reference counting needed for current->mempolicy 2551 * nor system default_policy 2552 */ 2553 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2554 pol = get_task_policy(current); 2555 2556 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, 2557 numa_node_id()); 2558 } 2559 2560 /** 2561 * alloc_pages - Allocate pages. 2562 * @gfp: GFP flags. 2563 * @order: Power of two of number of pages to allocate. 2564 * 2565 * Allocate 1 << @order contiguous pages. 
The physical address of the 2566 * first page is naturally aligned (eg an order-3 allocation will be aligned 2567 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 2568 * process is honoured when in process context. 2569 * 2570 * Context: Can be called from any context, providing the appropriate GFP 2571 * flags are used. 2572 * Return: The page on success or NULL if allocation fails. 2573 */ 2574 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) 2575 { 2576 struct page *page = alloc_frozen_pages_noprof(gfp, order); 2577 2578 if (page) 2579 set_page_refcounted(page); 2580 return page; 2581 } 2582 EXPORT_SYMBOL(alloc_pages_noprof); 2583 2584 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) 2585 { 2586 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); 2587 } 2588 EXPORT_SYMBOL(folio_alloc_noprof); 2589 2590 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, 2591 struct mempolicy *pol, unsigned long nr_pages, 2592 struct page **page_array) 2593 { 2594 int nodes; 2595 unsigned long nr_pages_per_node; 2596 int delta; 2597 int i; 2598 unsigned long nr_allocated; 2599 unsigned long total_allocated = 0; 2600 2601 nodes = nodes_weight(pol->nodes); 2602 nr_pages_per_node = nr_pages / nodes; 2603 delta = nr_pages - nodes * nr_pages_per_node; 2604 2605 for (i = 0; i < nodes; i++) { 2606 if (delta) { 2607 nr_allocated = alloc_pages_bulk_noprof(gfp, 2608 interleave_nodes(pol), NULL, 2609 nr_pages_per_node + 1, 2610 page_array); 2611 delta--; 2612 } else { 2613 nr_allocated = alloc_pages_bulk_noprof(gfp, 2614 interleave_nodes(pol), NULL, 2615 nr_pages_per_node, page_array); 2616 } 2617 2618 page_array += nr_allocated; 2619 total_allocated += nr_allocated; 2620 } 2621 2622 return total_allocated; 2623 } 2624 2625 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, 2626 struct mempolicy *pol, unsigned long nr_pages, 2627 struct page **page_array) 2628 { 2629 struct weighted_interleave_state *state; 2630 struct task_struct *me = current; 2631 unsigned int cpuset_mems_cookie; 2632 unsigned long total_allocated = 0; 2633 unsigned long nr_allocated = 0; 2634 unsigned long rounds; 2635 unsigned long node_pages, delta; 2636 u8 *weights, weight; 2637 unsigned int weight_total = 0; 2638 unsigned long rem_pages = nr_pages; 2639 nodemask_t nodes; 2640 int nnodes, node; 2641 int resume_node = MAX_NUMNODES - 1; 2642 u8 resume_weight = 0; 2643 int prev_node; 2644 int i; 2645 2646 if (!nr_pages) 2647 return 0; 2648 2649 /* read the nodes onto the stack, retry if done during rebind */ 2650 do { 2651 cpuset_mems_cookie = read_mems_allowed_begin(); 2652 nnodes = read_once_policy_nodemask(pol, &nodes); 2653 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2654 2655 /* if the nodemask has become invalid, we cannot do anything */ 2656 if (!nnodes) 2657 return 0; 2658 2659 /* Continue allocating from most recent node and adjust the nr_pages */ 2660 node = me->il_prev; 2661 weight = me->il_weight; 2662 if (weight && node_isset(node, nodes)) { 2663 node_pages = min(rem_pages, weight); 2664 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2665 page_array); 2666 page_array += nr_allocated; 2667 total_allocated += nr_allocated; 2668 /* if that's all the pages, no need to interleave */ 2669 if (rem_pages <= weight) { 2670 me->il_weight -= rem_pages; 2671 return total_allocated; 2672 } 2673 /* Otherwise we adjust remaining pages, continue from there */ 2674 rem_pages -= weight; 2675 } 2676 /* clear active weight in 
case of an allocation failure */ 2677 me->il_weight = 0; 2678 prev_node = node; 2679 2680 /* create a local copy of node weights to operate on outside rcu */ 2681 weights = kzalloc(nr_node_ids, GFP_KERNEL); 2682 if (!weights) 2683 return total_allocated; 2684 2685 rcu_read_lock(); 2686 state = rcu_dereference(wi_state); 2687 if (state) { 2688 memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); 2689 rcu_read_unlock(); 2690 } else { 2691 rcu_read_unlock(); 2692 for (i = 0; i < nr_node_ids; i++) 2693 weights[i] = 1; 2694 } 2695 2696 /* calculate total, detect system default usage */ 2697 for_each_node_mask(node, nodes) 2698 weight_total += weights[node]; 2699 2700 /* 2701 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2702 * Track which node weighted interleave should resume from. 2703 * 2704 * if (rounds > 0) and (delta == 0), resume_node will always be 2705 * the node following prev_node and its weight. 2706 */ 2707 rounds = rem_pages / weight_total; 2708 delta = rem_pages % weight_total; 2709 resume_node = next_node_in(prev_node, nodes); 2710 resume_weight = weights[resume_node]; 2711 for (i = 0; i < nnodes; i++) { 2712 node = next_node_in(prev_node, nodes); 2713 weight = weights[node]; 2714 node_pages = weight * rounds; 2715 /* If a delta exists, add this node's portion of the delta */ 2716 if (delta > weight) { 2717 node_pages += weight; 2718 delta -= weight; 2719 } else if (delta) { 2720 /* when delta is depleted, resume from that node */ 2721 node_pages += delta; 2722 resume_node = node; 2723 resume_weight = weight - delta; 2724 delta = 0; 2725 } 2726 /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2727 if (!node_pages) 2728 break; 2729 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2730 page_array); 2731 page_array += nr_allocated; 2732 total_allocated += nr_allocated; 2733 if (total_allocated == nr_pages) 2734 break; 2735 prev_node = node; 2736 } 2737 me->il_prev = resume_node; 2738 me->il_weight = resume_weight; 2739 kfree(weights); 2740 return total_allocated; 2741 } 2742 2743 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, 2744 struct mempolicy *pol, unsigned long nr_pages, 2745 struct page **page_array) 2746 { 2747 gfp_t preferred_gfp; 2748 unsigned long nr_allocated = 0; 2749 2750 preferred_gfp = gfp | __GFP_NOWARN; 2751 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2752 2753 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, 2754 nr_pages, page_array); 2755 2756 if (nr_allocated < nr_pages) 2757 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, 2758 nr_pages - nr_allocated, 2759 page_array + nr_allocated); 2760 return nr_allocated; 2761 } 2762 2763 /* alloc pages bulk and mempolicy should be considered at the 2764 * same time in some situation such as vmalloc. 2765 * 2766 * It can accelerate memory allocation especially interleaving 2767 * allocate memory. 
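 * Interleave, weighted interleave and preferred-many each get a dedicated
 * bulk path below; all other policies go through policy_nodemask() and a
 * single alloc_pages_bulk_noprof() call.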
2768 */ 2769 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, 2770 unsigned long nr_pages, struct page **page_array) 2771 { 2772 struct mempolicy *pol = &default_policy; 2773 nodemask_t *nodemask; 2774 int nid; 2775 2776 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2777 pol = get_task_policy(current); 2778 2779 if (pol->mode == MPOL_INTERLEAVE) 2780 return alloc_pages_bulk_interleave(gfp, pol, 2781 nr_pages, page_array); 2782 2783 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2784 return alloc_pages_bulk_weighted_interleave( 2785 gfp, pol, nr_pages, page_array); 2786 2787 if (pol->mode == MPOL_PREFERRED_MANY) 2788 return alloc_pages_bulk_preferred_many(gfp, 2789 numa_node_id(), pol, nr_pages, page_array); 2790 2791 nid = numa_node_id(); 2792 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2793 return alloc_pages_bulk_noprof(gfp, nid, nodemask, 2794 nr_pages, page_array); 2795 } 2796 2797 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2798 { 2799 struct mempolicy *pol = mpol_dup(src->vm_policy); 2800 2801 if (IS_ERR(pol)) 2802 return PTR_ERR(pol); 2803 dst->vm_policy = pol; 2804 return 0; 2805 } 2806 2807 /* 2808 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2809 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2810 * with the mems_allowed returned by cpuset_mems_allowed(). This 2811 * keeps mempolicies cpuset relative after its cpuset moves. See 2812 * further kernel/cpuset.c update_nodemask(). 2813 * 2814 * current's mempolicy may be rebinded by the other task(the task that changes 2815 * cpuset's mems), so we needn't do rebind work for current task. 2816 */ 2817 2818 /* Slow path of a mempolicy duplicate */ 2819 struct mempolicy *__mpol_dup(struct mempolicy *old) 2820 { 2821 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2822 2823 if (!new) 2824 return ERR_PTR(-ENOMEM); 2825 2826 /* task's mempolicy is protected by alloc_lock */ 2827 if (old == current->mempolicy) { 2828 task_lock(current); 2829 *new = *old; 2830 task_unlock(current); 2831 } else 2832 *new = *old; 2833 2834 if (current_cpuset_is_being_rebound()) { 2835 nodemask_t mems = cpuset_mems_allowed(current); 2836 mpol_rebind_policy(new, &mems); 2837 } 2838 atomic_set(&new->refcnt, 1); 2839 return new; 2840 } 2841 2842 /* Slow path of a mempolicy comparison */ 2843 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2844 { 2845 if (!a || !b) 2846 return false; 2847 if (a->mode != b->mode) 2848 return false; 2849 if (a->flags != b->flags) 2850 return false; 2851 if (a->home_node != b->home_node) 2852 return false; 2853 if (mpol_store_user_nodemask(a)) 2854 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2855 return false; 2856 2857 switch (a->mode) { 2858 case MPOL_BIND: 2859 case MPOL_INTERLEAVE: 2860 case MPOL_PREFERRED: 2861 case MPOL_PREFERRED_MANY: 2862 case MPOL_WEIGHTED_INTERLEAVE: 2863 return !!nodes_equal(a->nodes, b->nodes); 2864 case MPOL_LOCAL: 2865 return true; 2866 default: 2867 BUG(); 2868 return false; 2869 } 2870 } 2871 2872 /* 2873 * Shared memory backing store policy support. 2874 * 2875 * Remember policies even when nobody has shared memory mapped. 2876 * The policies are kept in Red-Black tree linked from the inode. 2877 * They are protected by the sp->lock rwlock, which should be held 2878 * for any accesses to the tree. 2879 */ 2880 2881 /* 2882 * lookup first element intersecting start-end. 
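 * (i.e. the leftmost node in the tree whose range overlaps [start, end)).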
Caller holds sp->lock for 2883 * reading or for writing 2884 */ 2885 static struct sp_node *sp_lookup(struct shared_policy *sp, 2886 pgoff_t start, pgoff_t end) 2887 { 2888 struct rb_node *n = sp->root.rb_node; 2889 2890 while (n) { 2891 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2892 2893 if (start >= p->end) 2894 n = n->rb_right; 2895 else if (end <= p->start) 2896 n = n->rb_left; 2897 else 2898 break; 2899 } 2900 if (!n) 2901 return NULL; 2902 for (;;) { 2903 struct sp_node *w = NULL; 2904 struct rb_node *prev = rb_prev(n); 2905 if (!prev) 2906 break; 2907 w = rb_entry(prev, struct sp_node, nd); 2908 if (w->end <= start) 2909 break; 2910 n = prev; 2911 } 2912 return rb_entry(n, struct sp_node, nd); 2913 } 2914 2915 /* 2916 * Insert a new shared policy into the list. Caller holds sp->lock for 2917 * writing. 2918 */ 2919 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2920 { 2921 struct rb_node **p = &sp->root.rb_node; 2922 struct rb_node *parent = NULL; 2923 struct sp_node *nd; 2924 2925 while (*p) { 2926 parent = *p; 2927 nd = rb_entry(parent, struct sp_node, nd); 2928 if (new->start < nd->start) 2929 p = &(*p)->rb_left; 2930 else if (new->end > nd->end) 2931 p = &(*p)->rb_right; 2932 else 2933 BUG(); 2934 } 2935 rb_link_node(&new->nd, parent, p); 2936 rb_insert_color(&new->nd, &sp->root); 2937 } 2938 2939 /* Find shared policy intersecting idx */ 2940 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 2941 pgoff_t idx) 2942 { 2943 struct mempolicy *pol = NULL; 2944 struct sp_node *sn; 2945 2946 if (!sp->root.rb_node) 2947 return NULL; 2948 read_lock(&sp->lock); 2949 sn = sp_lookup(sp, idx, idx+1); 2950 if (sn) { 2951 mpol_get(sn->policy); 2952 pol = sn->policy; 2953 } 2954 read_unlock(&sp->lock); 2955 return pol; 2956 } 2957 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); 2958 2959 static void sp_free(struct sp_node *n) 2960 { 2961 mpol_put(n->policy); 2962 kmem_cache_free(sn_cache, n); 2963 } 2964 2965 /** 2966 * mpol_misplaced - check whether current folio node is valid in policy 2967 * 2968 * @folio: folio to be checked 2969 * @vmf: structure describing the fault 2970 * @addr: virtual address in @vma for shared policy lookup and interleave policy 2971 * 2972 * Lookup current policy node id for vma,addr and "compare to" folio's 2973 * node id. Policy determination "mimics" alloc_page_vma(). 2974 * Called from fault path where we know the vma and faulting address. 2975 * 2976 * Return: NUMA_NO_NODE if the page is in a node that is valid for this 2977 * policy, or a suitable node ID to allocate a replacement folio from. 
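 * The caller must hold the page table lock for the faulting entry
 * (vmf->ptl); this keeps the CPU and node id stable for the checks below.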
2978 */ 2979 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, 2980 unsigned long addr) 2981 { 2982 struct mempolicy *pol; 2983 pgoff_t ilx; 2984 struct zoneref *z; 2985 int curnid = folio_nid(folio); 2986 struct vm_area_struct *vma = vmf->vma; 2987 int thiscpu = raw_smp_processor_id(); 2988 int thisnid = numa_node_id(); 2989 int polnid = NUMA_NO_NODE; 2990 int ret = NUMA_NO_NODE; 2991 2992 /* 2993 * Make sure ptl is held so that we don't preempt and we 2994 * have a stable smp processor id 2995 */ 2996 lockdep_assert_held(vmf->ptl); 2997 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2998 if (!(pol->flags & MPOL_F_MOF)) 2999 goto out; 3000 3001 switch (pol->mode) { 3002 case MPOL_INTERLEAVE: 3003 polnid = interleave_nid(pol, ilx); 3004 break; 3005 3006 case MPOL_WEIGHTED_INTERLEAVE: 3007 polnid = weighted_interleave_nid(pol, ilx); 3008 break; 3009 3010 case MPOL_PREFERRED: 3011 if (node_isset(curnid, pol->nodes)) 3012 goto out; 3013 polnid = first_node(pol->nodes); 3014 break; 3015 3016 case MPOL_LOCAL: 3017 polnid = numa_node_id(); 3018 break; 3019 3020 case MPOL_BIND: 3021 case MPOL_PREFERRED_MANY: 3022 /* 3023 * Even though MPOL_PREFERRED_MANY can allocate pages outside 3024 * policy nodemask we don't allow numa migration to nodes 3025 * outside policy nodemask for now. This is done so that if we 3026 * want demotion to slow memory to happen, before allocating 3027 * from some DRAM node say 'x', we will end up using a 3028 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario 3029 * we should not promote to node 'x' from slow memory node. 3030 */ 3031 if (pol->flags & MPOL_F_MORON) { 3032 /* 3033 * Optimize placement among multiple nodes 3034 * via NUMA balancing 3035 */ 3036 if (node_isset(thisnid, pol->nodes)) 3037 break; 3038 goto out; 3039 } 3040 3041 /* 3042 * use current page if in policy nodemask, 3043 * else select nearest allowed node, if any. 3044 * If no allowed nodes, use current [!misplaced]. 3045 */ 3046 if (node_isset(curnid, pol->nodes)) 3047 goto out; 3048 z = first_zones_zonelist( 3049 node_zonelist(thisnid, GFP_HIGHUSER), 3050 gfp_zone(GFP_HIGHUSER), 3051 &pol->nodes); 3052 polnid = zonelist_node_idx(z); 3053 break; 3054 3055 default: 3056 BUG(); 3057 } 3058 3059 /* Migrate the folio towards the node whose CPU is referencing it */ 3060 if (pol->flags & MPOL_F_MORON) { 3061 polnid = thisnid; 3062 3063 if (!should_numa_migrate_memory(current, folio, curnid, 3064 thiscpu)) 3065 goto out; 3066 } 3067 3068 if (curnid != polnid) 3069 ret = polnid; 3070 out: 3071 mpol_cond_put(pol); 3072 3073 return ret; 3074 } 3075 3076 /* 3077 * Drop the (possibly final) reference to task->mempolicy. It needs to be 3078 * dropped after task->mempolicy is set to NULL so that any allocation done as 3079 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 3080 * policy. 
3081 */ 3082 void mpol_put_task_policy(struct task_struct *task) 3083 { 3084 struct mempolicy *pol; 3085 3086 task_lock(task); 3087 pol = task->mempolicy; 3088 task->mempolicy = NULL; 3089 task_unlock(task); 3090 mpol_put(pol); 3091 } 3092 3093 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 3094 { 3095 rb_erase(&n->nd, &sp->root); 3096 sp_free(n); 3097 } 3098 3099 static void sp_node_init(struct sp_node *node, unsigned long start, 3100 unsigned long end, struct mempolicy *pol) 3101 { 3102 node->start = start; 3103 node->end = end; 3104 node->policy = pol; 3105 } 3106 3107 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 3108 struct mempolicy *pol) 3109 { 3110 struct sp_node *n; 3111 struct mempolicy *newpol; 3112 3113 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3114 if (!n) 3115 return NULL; 3116 3117 newpol = mpol_dup(pol); 3118 if (IS_ERR(newpol)) { 3119 kmem_cache_free(sn_cache, n); 3120 return NULL; 3121 } 3122 newpol->flags |= MPOL_F_SHARED; 3123 sp_node_init(n, start, end, newpol); 3124 3125 return n; 3126 } 3127 3128 /* Replace a policy range. */ 3129 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 3130 pgoff_t end, struct sp_node *new) 3131 { 3132 struct sp_node *n; 3133 struct sp_node *n_new = NULL; 3134 struct mempolicy *mpol_new = NULL; 3135 int ret = 0; 3136 3137 restart: 3138 write_lock(&sp->lock); 3139 n = sp_lookup(sp, start, end); 3140 /* Take care of old policies in the same range. */ 3141 while (n && n->start < end) { 3142 struct rb_node *next = rb_next(&n->nd); 3143 if (n->start >= start) { 3144 if (n->end <= end) 3145 sp_delete(sp, n); 3146 else 3147 n->start = end; 3148 } else { 3149 /* Old policy spanning whole new range. */ 3150 if (n->end > end) { 3151 if (!n_new) 3152 goto alloc_new; 3153 3154 *mpol_new = *n->policy; 3155 atomic_set(&mpol_new->refcnt, 1); 3156 sp_node_init(n_new, end, n->end, mpol_new); 3157 n->end = start; 3158 sp_insert(sp, n_new); 3159 n_new = NULL; 3160 mpol_new = NULL; 3161 break; 3162 } else 3163 n->end = start; 3164 } 3165 if (!next) 3166 break; 3167 n = rb_entry(next, struct sp_node, nd); 3168 } 3169 if (new) 3170 sp_insert(sp, new); 3171 write_unlock(&sp->lock); 3172 ret = 0; 3173 3174 err_out: 3175 if (mpol_new) 3176 mpol_put(mpol_new); 3177 if (n_new) 3178 kmem_cache_free(sn_cache, n_new); 3179 3180 return ret; 3181 3182 alloc_new: 3183 write_unlock(&sp->lock); 3184 ret = -ENOMEM; 3185 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3186 if (!n_new) 3187 goto err_out; 3188 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3189 if (!mpol_new) 3190 goto err_out; 3191 atomic_set(&mpol_new->refcnt, 1); 3192 goto restart; 3193 } 3194 3195 /** 3196 * mpol_shared_policy_init - initialize shared policy for inode 3197 * @sp: pointer to inode shared policy 3198 * @mpol: struct mempolicy to install 3199 * 3200 * Install non-NULL @mpol in inode's shared policy rb-tree. 3201 * On entry, the current task has a reference on a non-NULL @mpol. 3202 * This must be released on exit. 3203 * This is called at get_inode() calls and we can use GFP_KERNEL. 
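 * If any allocation here fails, the tree is simply left empty, i.e. the
 * file falls back to the default policy.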
3204 */ 3205 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3206 { 3207 int ret; 3208 3209 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3210 rwlock_init(&sp->lock); 3211 3212 if (mpol) { 3213 struct sp_node *sn; 3214 struct mempolicy *npol; 3215 NODEMASK_SCRATCH(scratch); 3216 3217 if (!scratch) 3218 goto put_mpol; 3219 3220 /* contextualize the tmpfs mount point mempolicy to this file */ 3221 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3222 if (IS_ERR(npol)) 3223 goto free_scratch; /* no valid nodemask intersection */ 3224 3225 task_lock(current); 3226 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3227 task_unlock(current); 3228 if (ret) 3229 goto put_npol; 3230 3231 /* alloc node covering entire file; adds ref to file's npol */ 3232 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3233 if (sn) 3234 sp_insert(sp, sn); 3235 put_npol: 3236 mpol_put(npol); /* drop initial ref on file's npol */ 3237 free_scratch: 3238 NODEMASK_SCRATCH_FREE(scratch); 3239 put_mpol: 3240 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3241 } 3242 } 3243 EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); 3244 3245 int mpol_set_shared_policy(struct shared_policy *sp, 3246 struct vm_area_struct *vma, struct mempolicy *pol) 3247 { 3248 int err; 3249 struct sp_node *new = NULL; 3250 unsigned long sz = vma_pages(vma); 3251 3252 if (pol) { 3253 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3254 if (!new) 3255 return -ENOMEM; 3256 } 3257 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3258 if (err && new) 3259 sp_free(new); 3260 return err; 3261 } 3262 EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); 3263 3264 /* Free a backing policy store on inode delete. */ 3265 void mpol_free_shared_policy(struct shared_policy *sp) 3266 { 3267 struct sp_node *n; 3268 struct rb_node *next; 3269 3270 if (!sp->root.rb_node) 3271 return; 3272 write_lock(&sp->lock); 3273 next = rb_first(&sp->root); 3274 while (next) { 3275 n = rb_entry(next, struct sp_node, nd); 3276 next = rb_next(&n->nd); 3277 sp_delete(sp, n); 3278 } 3279 write_unlock(&sp->lock); 3280 } 3281 EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); 3282 3283 #ifdef CONFIG_NUMA_BALANCING 3284 static int __initdata numabalancing_override; 3285 3286 static void __init check_numabalancing_enable(void) 3287 { 3288 bool numabalancing_default = false; 3289 3290 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 3291 numabalancing_default = true; 3292 3293 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3294 if (numabalancing_override) 3295 set_numabalancing_state(numabalancing_override == 1); 3296 3297 if (num_online_nodes() > 1 && !numabalancing_override) { 3298 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3299 numabalancing_default ? 
"Enabling" : "Disabling"); 3300 set_numabalancing_state(numabalancing_default); 3301 } 3302 } 3303 3304 static int __init setup_numabalancing(char *str) 3305 { 3306 int ret = 0; 3307 if (!str) 3308 goto out; 3309 3310 if (!strcmp(str, "enable")) { 3311 numabalancing_override = 1; 3312 ret = 1; 3313 } else if (!strcmp(str, "disable")) { 3314 numabalancing_override = -1; 3315 ret = 1; 3316 } 3317 out: 3318 if (!ret) 3319 pr_warn("Unable to parse numa_balancing=\n"); 3320 3321 return ret; 3322 } 3323 __setup("numa_balancing=", setup_numabalancing); 3324 #else 3325 static inline void __init check_numabalancing_enable(void) 3326 { 3327 } 3328 #endif /* CONFIG_NUMA_BALANCING */ 3329 3330 void __init numa_policy_init(void) 3331 { 3332 nodemask_t interleave_nodes; 3333 unsigned long largest = 0; 3334 int nid, prefer = 0; 3335 3336 policy_cache = kmem_cache_create("numa_policy", 3337 sizeof(struct mempolicy), 3338 0, SLAB_PANIC, NULL); 3339 3340 sn_cache = kmem_cache_create("shared_policy_node", 3341 sizeof(struct sp_node), 3342 0, SLAB_PANIC, NULL); 3343 3344 for_each_node(nid) { 3345 preferred_node_policy[nid] = (struct mempolicy) { 3346 .refcnt = ATOMIC_INIT(1), 3347 .mode = MPOL_PREFERRED, 3348 .flags = MPOL_F_MOF | MPOL_F_MORON, 3349 .nodes = nodemask_of_node(nid), 3350 }; 3351 } 3352 3353 /* 3354 * Set interleaving policy for system init. Interleaving is only 3355 * enabled across suitably sized nodes (default is >= 16MB), or 3356 * fall back to the largest node if they're all smaller. 3357 */ 3358 nodes_clear(interleave_nodes); 3359 for_each_node_state(nid, N_MEMORY) { 3360 unsigned long total_pages = node_present_pages(nid); 3361 3362 /* Preserve the largest node */ 3363 if (largest < total_pages) { 3364 largest = total_pages; 3365 prefer = nid; 3366 } 3367 3368 /* Interleave this node? */ 3369 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3370 node_set(nid, interleave_nodes); 3371 } 3372 3373 /* All too small, use the largest */ 3374 if (unlikely(nodes_empty(interleave_nodes))) 3375 node_set(prefer, interleave_nodes); 3376 3377 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3378 pr_err("%s: interleaving failed\n", __func__); 3379 3380 check_numabalancing_enable(); 3381 } 3382 3383 /* Reset policy of current process to default */ 3384 void numa_default_policy(void) 3385 { 3386 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 3387 } 3388 3389 /* 3390 * Parse and format mempolicy from/to strings 3391 */ 3392 static const char * const policy_modes[] = 3393 { 3394 [MPOL_DEFAULT] = "default", 3395 [MPOL_PREFERRED] = "prefer", 3396 [MPOL_BIND] = "bind", 3397 [MPOL_INTERLEAVE] = "interleave", 3398 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3399 [MPOL_LOCAL] = "local", 3400 [MPOL_PREFERRED_MANY] = "prefer (many)", 3401 }; 3402 3403 #ifdef CONFIG_TMPFS 3404 /** 3405 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 3406 * @str: string containing mempolicy to parse 3407 * @mpol: pointer to struct mempolicy pointer, returned on success. 
3408 * 3409 * Format of input: 3410 * <mode>[=<flags>][:<nodelist>] 3411 * 3412 * Return: %0 on success, else %1 3413 */ 3414 int mpol_parse_str(char *str, struct mempolicy **mpol) 3415 { 3416 struct mempolicy *new = NULL; 3417 unsigned short mode_flags; 3418 nodemask_t nodes; 3419 char *nodelist = strchr(str, ':'); 3420 char *flags = strchr(str, '='); 3421 int err = 1, mode; 3422 3423 if (flags) 3424 *flags++ = '\0'; /* terminate mode string */ 3425 3426 if (nodelist) { 3427 /* NUL-terminate mode or flags string */ 3428 *nodelist++ = '\0'; 3429 if (nodelist_parse(nodelist, nodes)) 3430 goto out; 3431 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3432 goto out; 3433 } else 3434 nodes_clear(nodes); 3435 3436 mode = match_string(policy_modes, MPOL_MAX, str); 3437 if (mode < 0) 3438 goto out; 3439 3440 switch (mode) { 3441 case MPOL_PREFERRED: 3442 /* 3443 * Insist on a nodelist of one node only, although later 3444 * we use first_node(nodes) to grab a single node, so here 3445 * nodelist (or nodes) cannot be empty. 3446 */ 3447 if (nodelist) { 3448 char *rest = nodelist; 3449 while (isdigit(*rest)) 3450 rest++; 3451 if (*rest) 3452 goto out; 3453 if (nodes_empty(nodes)) 3454 goto out; 3455 } 3456 break; 3457 case MPOL_INTERLEAVE: 3458 case MPOL_WEIGHTED_INTERLEAVE: 3459 /* 3460 * Default to online nodes with memory if no nodelist 3461 */ 3462 if (!nodelist) 3463 nodes = node_states[N_MEMORY]; 3464 break; 3465 case MPOL_LOCAL: 3466 /* 3467 * Don't allow a nodelist; mpol_new() checks flags 3468 */ 3469 if (nodelist) 3470 goto out; 3471 break; 3472 case MPOL_DEFAULT: 3473 /* 3474 * Insist on a empty nodelist 3475 */ 3476 if (!nodelist) 3477 err = 0; 3478 goto out; 3479 case MPOL_PREFERRED_MANY: 3480 case MPOL_BIND: 3481 /* 3482 * Insist on a nodelist 3483 */ 3484 if (!nodelist) 3485 goto out; 3486 } 3487 3488 mode_flags = 0; 3489 if (flags) { 3490 /* 3491 * Currently, we only support two mutually exclusive 3492 * mode flags. 3493 */ 3494 if (!strcmp(flags, "static")) 3495 mode_flags |= MPOL_F_STATIC_NODES; 3496 else if (!strcmp(flags, "relative")) 3497 mode_flags |= MPOL_F_RELATIVE_NODES; 3498 else 3499 goto out; 3500 } 3501 3502 new = mpol_new(mode, mode_flags, &nodes); 3503 if (IS_ERR(new)) 3504 goto out; 3505 3506 /* 3507 * Save nodes for mpol_to_str() to show the tmpfs mount options 3508 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3509 */ 3510 if (mode != MPOL_PREFERRED) { 3511 new->nodes = nodes; 3512 } else if (nodelist) { 3513 nodes_clear(new->nodes); 3514 node_set(first_node(nodes), new->nodes); 3515 } else { 3516 new->mode = MPOL_LOCAL; 3517 } 3518 3519 /* 3520 * Save nodes for contextualization: this will be used to "clone" 3521 * the mempolicy in a specific context [cpuset] at a later time. 3522 */ 3523 new->w.user_nodemask = nodes; 3524 3525 err = 0; 3526 3527 out: 3528 /* Restore string for error message */ 3529 if (nodelist) 3530 *--nodelist = ':'; 3531 if (flags) 3532 *--flags = '='; 3533 if (!err) 3534 *mpol = new; 3535 return err; 3536 } 3537 #endif /* CONFIG_TMPFS */ 3538 3539 /** 3540 * mpol_to_str - format a mempolicy structure for printing 3541 * @buffer: to contain formatted mempolicy string 3542 * @maxlen: length of @buffer 3543 * @pol: pointer to mempolicy to be formatted 3544 * 3545 * Convert @pol into a string. If @buffer is too short, truncate the string. 3546 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3547 * interleave", plus the longest flag flags, "relative|balancing", and to 3548 * display at least a few node ids. 
3549 */ 3550 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3551 { 3552 char *p = buffer; 3553 nodemask_t nodes = NODE_MASK_NONE; 3554 unsigned short mode = MPOL_DEFAULT; 3555 unsigned short flags = 0; 3556 3557 if (pol && 3558 pol != &default_policy && 3559 !(pol >= &preferred_node_policy[0] && 3560 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3561 mode = pol->mode; 3562 flags = pol->flags; 3563 } 3564 3565 switch (mode) { 3566 case MPOL_DEFAULT: 3567 case MPOL_LOCAL: 3568 break; 3569 case MPOL_PREFERRED: 3570 case MPOL_PREFERRED_MANY: 3571 case MPOL_BIND: 3572 case MPOL_INTERLEAVE: 3573 case MPOL_WEIGHTED_INTERLEAVE: 3574 nodes = pol->nodes; 3575 break; 3576 default: 3577 WARN_ON_ONCE(1); 3578 snprintf(p, maxlen, "unknown"); 3579 return; 3580 } 3581 3582 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3583 3584 if (flags & MPOL_MODE_FLAGS) { 3585 p += snprintf(p, buffer + maxlen - p, "="); 3586 3587 /* 3588 * Static and relative are mutually exclusive. 3589 */ 3590 if (flags & MPOL_F_STATIC_NODES) 3591 p += snprintf(p, buffer + maxlen - p, "static"); 3592 else if (flags & MPOL_F_RELATIVE_NODES) 3593 p += snprintf(p, buffer + maxlen - p, "relative"); 3594 3595 if (flags & MPOL_F_NUMA_BALANCING) { 3596 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3597 p += snprintf(p, buffer + maxlen - p, "|"); 3598 p += snprintf(p, buffer + maxlen - p, "balancing"); 3599 } 3600 } 3601 3602 if (!nodes_empty(nodes)) 3603 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3604 nodemask_pr_args(&nodes)); 3605 } 3606 3607 #ifdef CONFIG_SYSFS 3608 struct iw_node_attr { 3609 struct kobj_attribute kobj_attr; 3610 int nid; 3611 }; 3612 3613 struct sysfs_wi_group { 3614 struct kobject wi_kobj; 3615 struct mutex kobj_lock; 3616 struct iw_node_attr *nattrs[]; 3617 }; 3618 3619 static struct sysfs_wi_group *wi_group; 3620 3621 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3622 char *buf) 3623 { 3624 struct iw_node_attr *node_attr; 3625 u8 weight; 3626 3627 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3628 weight = get_il_weight(node_attr->nid); 3629 return sysfs_emit(buf, "%d\n", weight); 3630 } 3631 3632 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3633 const char *buf, size_t count) 3634 { 3635 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3636 struct iw_node_attr *node_attr; 3637 u8 weight = 0; 3638 int i; 3639 3640 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3641 if (count == 0 || sysfs_streq(buf, "") || 3642 kstrtou8(buf, 0, &weight) || weight == 0) 3643 return -EINVAL; 3644 3645 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 3646 GFP_KERNEL); 3647 if (!new_wi_state) 3648 return -ENOMEM; 3649 3650 mutex_lock(&wi_state_lock); 3651 old_wi_state = rcu_dereference_protected(wi_state, 3652 lockdep_is_held(&wi_state_lock)); 3653 if (old_wi_state) { 3654 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3655 nr_node_ids * sizeof(u8)); 3656 } else { 3657 for (i = 0; i < nr_node_ids; i++) 3658 new_wi_state->iw_table[i] = 1; 3659 } 3660 new_wi_state->iw_table[node_attr->nid] = weight; 3661 new_wi_state->mode_auto = false; 3662 3663 rcu_assign_pointer(wi_state, new_wi_state); 3664 mutex_unlock(&wi_state_lock); 3665 if (old_wi_state) { 3666 synchronize_rcu(); 3667 kfree(old_wi_state); 3668 } 3669 return count; 3670 } 3671 3672 static ssize_t weighted_interleave_auto_show(struct kobject *kobj, 3673 struct kobj_attribute 
*attr, char *buf) 3674 { 3675 struct weighted_interleave_state *state; 3676 bool wi_auto = true; 3677 3678 rcu_read_lock(); 3679 state = rcu_dereference(wi_state); 3680 if (state) 3681 wi_auto = state->mode_auto; 3682 rcu_read_unlock(); 3683 3684 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); 3685 } 3686 3687 static ssize_t weighted_interleave_auto_store(struct kobject *kobj, 3688 struct kobj_attribute *attr, const char *buf, size_t count) 3689 { 3690 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3691 unsigned int *bw; 3692 bool input; 3693 int i; 3694 3695 if (kstrtobool(buf, &input)) 3696 return -EINVAL; 3697 3698 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 3699 GFP_KERNEL); 3700 if (!new_wi_state) 3701 return -ENOMEM; 3702 for (i = 0; i < nr_node_ids; i++) 3703 new_wi_state->iw_table[i] = 1; 3704 3705 mutex_lock(&wi_state_lock); 3706 if (!input) { 3707 old_wi_state = rcu_dereference_protected(wi_state, 3708 lockdep_is_held(&wi_state_lock)); 3709 if (!old_wi_state) 3710 goto update_wi_state; 3711 if (input == old_wi_state->mode_auto) { 3712 mutex_unlock(&wi_state_lock); 3713 return count; 3714 } 3715 3716 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3717 nr_node_ids * sizeof(u8)); 3718 goto update_wi_state; 3719 } 3720 3721 bw = node_bw_table; 3722 if (!bw) { 3723 mutex_unlock(&wi_state_lock); 3724 kfree(new_wi_state); 3725 return -ENODEV; 3726 } 3727 3728 new_wi_state->mode_auto = true; 3729 reduce_interleave_weights(bw, new_wi_state->iw_table); 3730 3731 update_wi_state: 3732 rcu_assign_pointer(wi_state, new_wi_state); 3733 mutex_unlock(&wi_state_lock); 3734 if (old_wi_state) { 3735 synchronize_rcu(); 3736 kfree(old_wi_state); 3737 } 3738 return count; 3739 } 3740 3741 static void sysfs_wi_node_delete(int nid) 3742 { 3743 struct iw_node_attr *attr; 3744 3745 if (nid < 0 || nid >= nr_node_ids) 3746 return; 3747 3748 mutex_lock(&wi_group->kobj_lock); 3749 attr = wi_group->nattrs[nid]; 3750 if (!attr) { 3751 mutex_unlock(&wi_group->kobj_lock); 3752 return; 3753 } 3754 3755 wi_group->nattrs[nid] = NULL; 3756 mutex_unlock(&wi_group->kobj_lock); 3757 3758 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); 3759 kfree(attr->kobj_attr.attr.name); 3760 kfree(attr); 3761 } 3762 3763 static void sysfs_wi_node_delete_all(void) 3764 { 3765 int nid; 3766 3767 for (nid = 0; nid < nr_node_ids; nid++) 3768 sysfs_wi_node_delete(nid); 3769 } 3770 3771 static void wi_state_free(void) 3772 { 3773 struct weighted_interleave_state *old_wi_state; 3774 3775 mutex_lock(&wi_state_lock); 3776 old_wi_state = rcu_dereference_protected(wi_state, 3777 lockdep_is_held(&wi_state_lock)); 3778 rcu_assign_pointer(wi_state, NULL); 3779 mutex_unlock(&wi_state_lock); 3780 3781 if (old_wi_state) { 3782 synchronize_rcu(); 3783 kfree(old_wi_state); 3784 } 3785 } 3786 3787 static struct kobj_attribute wi_auto_attr = 3788 __ATTR(auto, 0664, weighted_interleave_auto_show, 3789 weighted_interleave_auto_store); 3790 3791 static void wi_cleanup(void) { 3792 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3793 sysfs_wi_node_delete_all(); 3794 wi_state_free(); 3795 } 3796 3797 static void wi_kobj_release(struct kobject *wi_kobj) 3798 { 3799 kfree(wi_group); 3800 } 3801 3802 static const struct kobj_type wi_ktype = { 3803 .sysfs_ops = &kobj_sysfs_ops, 3804 .release = wi_kobj_release, 3805 }; 3806 3807 static int sysfs_wi_node_add(int nid) 3808 { 3809 int ret; 3810 char *name; 3811 struct iw_node_attr *new_attr; 3812 3813 if (nid < 0 || nid >= 
nr_node_ids) { 3814 pr_err("invalid node id: %d\n", nid); 3815 return -EINVAL; 3816 } 3817 3818 new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); 3819 if (!new_attr) 3820 return -ENOMEM; 3821 3822 name = kasprintf(GFP_KERNEL, "node%d", nid); 3823 if (!name) { 3824 kfree(new_attr); 3825 return -ENOMEM; 3826 } 3827 3828 sysfs_attr_init(&new_attr->kobj_attr.attr); 3829 new_attr->kobj_attr.attr.name = name; 3830 new_attr->kobj_attr.attr.mode = 0644; 3831 new_attr->kobj_attr.show = node_show; 3832 new_attr->kobj_attr.store = node_store; 3833 new_attr->nid = nid; 3834 3835 mutex_lock(&wi_group->kobj_lock); 3836 if (wi_group->nattrs[nid]) { 3837 mutex_unlock(&wi_group->kobj_lock); 3838 ret = -EEXIST; 3839 goto out; 3840 } 3841 3842 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); 3843 if (ret) { 3844 mutex_unlock(&wi_group->kobj_lock); 3845 goto out; 3846 } 3847 wi_group->nattrs[nid] = new_attr; 3848 mutex_unlock(&wi_group->kobj_lock); 3849 return 0; 3850 3851 out: 3852 kfree(new_attr->kobj_attr.attr.name); 3853 kfree(new_attr); 3854 return ret; 3855 } 3856 3857 static int wi_node_notifier(struct notifier_block *nb, 3858 unsigned long action, void *data) 3859 { 3860 int err; 3861 struct node_notify *nn = data; 3862 int nid = nn->nid; 3863 3864 switch (action) { 3865 case NODE_ADDED_FIRST_MEMORY: 3866 err = sysfs_wi_node_add(nid); 3867 if (err) 3868 pr_err("failed to add sysfs for node%d during hotplug: %d\n", 3869 nid, err); 3870 break; 3871 case NODE_REMOVED_LAST_MEMORY: 3872 sysfs_wi_node_delete(nid); 3873 break; 3874 } 3875 3876 return NOTIFY_OK; 3877 } 3878 3879 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) 3880 { 3881 int nid, err; 3882 3883 wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), 3884 GFP_KERNEL); 3885 if (!wi_group) 3886 return -ENOMEM; 3887 mutex_init(&wi_group->kobj_lock); 3888 3889 err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, 3890 "weighted_interleave"); 3891 if (err) 3892 goto err_put_kobj; 3893 3894 err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3895 if (err) 3896 goto err_put_kobj; 3897 3898 for_each_online_node(nid) { 3899 if (!node_state(nid, N_MEMORY)) 3900 continue; 3901 3902 err = sysfs_wi_node_add(nid); 3903 if (err) { 3904 pr_err("failed to add sysfs for node%d during init: %d\n", 3905 nid, err); 3906 goto err_cleanup_kobj; 3907 } 3908 } 3909 3910 hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); 3911 return 0; 3912 3913 err_cleanup_kobj: 3914 wi_cleanup(); 3915 kobject_del(&wi_group->wi_kobj); 3916 err_put_kobj: 3917 kobject_put(&wi_group->wi_kobj); 3918 return err; 3919 } 3920 3921 static int __init mempolicy_sysfs_init(void) 3922 { 3923 int err; 3924 static struct kobject *mempolicy_kobj; 3925 3926 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); 3927 if (!mempolicy_kobj) 3928 return -ENOMEM; 3929 3930 err = add_weighted_interleave_group(mempolicy_kobj); 3931 if (err) 3932 goto err_kobj; 3933 3934 return 0; 3935 3936 err_kobj: 3937 kobject_del(mempolicy_kobj); 3938 kobject_put(mempolicy_kobj); 3939 return err; 3940 } 3941 3942 late_initcall(mempolicy_sysfs_init); 3943 #endif /* CONFIG_SYSFS */ 3944
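/*
 * Illustrative userspace sketch (not part of the kernel build): roughly how
 * the syscalls implemented in this file are driven from user space through
 * the <numaif.h> wrappers shipped with libnuma. The node numbers are
 * placeholders and error handling is deliberately minimal; build with
 * "cc demo.c -lnuma".
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *		size_t len = 4UL << 20;
 *		int mode = -1;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *
 *		// Interleave this mapping across nodes 0 and 1.
 *		if (mbind(buf, len, MPOL_INTERLEAVE, &nodemask,
 *			  sizeof(nodemask) * 8, 0))
 *			perror("mbind");
 *
 *		// Read back the calling task's policy (MPOL_DEFAULT unless
 *		// set_mempolicy() was used).
 *		if (get_mempolicy(&mode, NULL, 0, NULL, 0))
 *			perror("get_mempolicy");
 *		printf("task policy mode: %d\n", mode);
 *
 *		munmap(buf, len);
 *		return 0;
 *	}
 */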