// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails. Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly
 *                restrict the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the
 *                allocation on the local node. This is normally identical
 *                to default, but useful to set in a VMA when you have a
 *                non-default process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for
 * memory allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
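/*
 * Illustrative userspace sketch (not part of this file): the policies above
 * are selected with set_mempolicy(2) for the whole task and with mbind(2)
 * for an individual mapping. This assumes the libnuma <numaif.h> wrappers
 * and a system with nodes 0 and 1; error handling is trimmed.
 *
 *      #include <numaif.h>
 *      #include <stdlib.h>
 *
 *      unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *      unsigned long node0 = 1UL << 0;
 *      size_t len = 1UL << 20;
 *      void *buf = aligned_alloc(4096, len);
 *
 *      // Task policy: interleave future allocations over nodes 0 and 1.
 *      set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *      // VMA policy: restrict this mapping to node 0 only.
 *      mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */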
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <linux/gcd.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through
 * nodes faster, more fair/even distribution) and large weights (smaller
 * errors between actual bandwidth ratios and weight ratios). 32 has been
 * found to be a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;
	u8 iw_table[];
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);

static u8 get_il_weight(int node)
{
	struct weighted_interleave_state *state;
	u8 weight = 1;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state)
		weight = state->iw_table[node];
	rcu_read_unlock();
	return weight;
}

/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock held.
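 *
 * Illustrative example (with weightiness == 32 as above): node bandwidths
 * of { 10000, 5000 } give sum_bw = 15000, so
 *
 *	new_iw[0] = 32 * 10000 / 15000 = 21
 *	new_iw[1] = 32 *  5000 / 15000 = 10
 *
 * and gcd(21, 10) == 1 leaves the weights as { 21, 10 }. Equal bandwidths
 * of { 10000, 10000 } would first give { 16, 16 } and then reduce by their
 * GCD to { 1, 1 }.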
184 */ 185 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) 186 { 187 u64 sum_bw = 0; 188 unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; 189 int nid; 190 191 for_each_node_state(nid, N_MEMORY) 192 sum_bw += bw[nid]; 193 194 /* Scale bandwidths to whole numbers in the range [1, weightiness] */ 195 for_each_node_state(nid, N_MEMORY) { 196 /* 197 * Try not to perform 64-bit division. 198 * If sum_bw < scaling_factor, then sum_bw < U32_MAX. 199 * If sum_bw > scaling_factor, then round the weight up to 1. 200 */ 201 scaling_factor = weightiness * bw[nid]; 202 if (bw[nid] && sum_bw < scaling_factor) { 203 cast_sum_bw = (unsigned int)sum_bw; 204 new_iw[nid] = scaling_factor / cast_sum_bw; 205 } else { 206 new_iw[nid] = 1; 207 } 208 if (!iw_gcd) 209 iw_gcd = new_iw[nid]; 210 iw_gcd = gcd(iw_gcd, new_iw[nid]); 211 } 212 213 /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ 214 for_each_node_state(nid, N_MEMORY) 215 new_iw[nid] /= iw_gcd; 216 } 217 218 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) 219 { 220 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 221 unsigned int *old_bw, *new_bw; 222 unsigned int bw_val; 223 int i; 224 225 bw_val = min(coords->read_bandwidth, coords->write_bandwidth); 226 new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); 227 if (!new_bw) 228 return -ENOMEM; 229 230 new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 231 GFP_KERNEL); 232 if (!new_wi_state) { 233 kfree(new_bw); 234 return -ENOMEM; 235 } 236 new_wi_state->mode_auto = true; 237 for (i = 0; i < nr_node_ids; i++) 238 new_wi_state->iw_table[i] = 1; 239 240 /* 241 * Update bandwidth info, even in manual mode. That way, when switching 242 * to auto mode in the future, iw_table can be overwritten using 243 * accurate bw data. 244 */ 245 mutex_lock(&wi_state_lock); 246 247 old_bw = node_bw_table; 248 if (old_bw) 249 memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); 250 new_bw[node] = bw_val; 251 node_bw_table = new_bw; 252 253 old_wi_state = rcu_dereference_protected(wi_state, 254 lockdep_is_held(&wi_state_lock)); 255 if (old_wi_state && !old_wi_state->mode_auto) { 256 /* Manual mode; skip reducing weights and updating wi_state */ 257 mutex_unlock(&wi_state_lock); 258 kfree(new_wi_state); 259 goto out; 260 } 261 262 /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ 263 reduce_interleave_weights(new_bw, new_wi_state->iw_table); 264 rcu_assign_pointer(wi_state, new_wi_state); 265 266 mutex_unlock(&wi_state_lock); 267 if (old_wi_state) { 268 synchronize_rcu(); 269 kfree(old_wi_state); 270 } 271 out: 272 kfree(old_bw); 273 return 0; 274 } 275 276 /** 277 * numa_nearest_node - Find nearest node by state 278 * @node: Node id to start the search 279 * @state: State to filter the search 280 * 281 * Lookup the closest node by distance if @nid is not in state. 
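 *
 * Example (illustrative): numa_nearest_node(3, N_MEMORY) returns 3 when
 * node 3 itself has memory; otherwise it returns the N_MEMORY node with
 * the smallest node_distance() from node 3, preferring the lowest-numbered
 * node on a tie.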
282 * 283 * Return: this @node if it is in state, otherwise the closest node by distance 284 */ 285 int numa_nearest_node(int node, unsigned int state) 286 { 287 int min_dist = INT_MAX, dist, n, min_node; 288 289 if (state >= NR_NODE_STATES) 290 return -EINVAL; 291 292 if (node == NUMA_NO_NODE || node_state(node, state)) 293 return node; 294 295 min_node = node; 296 for_each_node_state(n, state) { 297 dist = node_distance(node, n); 298 if (dist < min_dist) { 299 min_dist = dist; 300 min_node = n; 301 } 302 } 303 304 return min_node; 305 } 306 EXPORT_SYMBOL_GPL(numa_nearest_node); 307 308 /** 309 * nearest_node_nodemask - Find the node in @mask at the nearest distance 310 * from @node. 311 * 312 * @node: a valid node ID to start the search from. 313 * @mask: a pointer to a nodemask representing the allowed nodes. 314 * 315 * This function iterates over all nodes in @mask and calculates the 316 * distance from the starting @node, then it returns the node ID that is 317 * the closest to @node, or MAX_NUMNODES if no node is found. 318 * 319 * Note that @node must be a valid node ID usable with node_distance(), 320 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 321 * or unexpected behavior. 322 */ 323 int nearest_node_nodemask(int node, nodemask_t *mask) 324 { 325 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 326 327 for_each_node_mask(n, *mask) { 328 dist = node_distance(node, n); 329 if (dist < min_dist) { 330 min_dist = dist; 331 min_node = n; 332 } 333 } 334 335 return min_node; 336 } 337 EXPORT_SYMBOL_GPL(nearest_node_nodemask); 338 339 struct mempolicy *get_task_policy(struct task_struct *p) 340 { 341 struct mempolicy *pol = p->mempolicy; 342 int node; 343 344 if (pol) 345 return pol; 346 347 node = numa_node_id(); 348 if (node != NUMA_NO_NODE) { 349 pol = &preferred_node_policy[node]; 350 /* preferred_node_policy is not initialised early in boot */ 351 if (pol->mode) 352 return pol; 353 } 354 355 return &default_policy; 356 } 357 358 static const struct mempolicy_operations { 359 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 360 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 361 } mpol_ops[MPOL_MAX]; 362 363 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 364 { 365 return pol->flags & MPOL_MODE_FLAGS; 366 } 367 368 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 369 const nodemask_t *rel) 370 { 371 nodemask_t tmp; 372 nodes_fold(tmp, *orig, nodes_weight(*rel)); 373 nodes_onto(*ret, tmp, *rel); 374 } 375 376 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 377 { 378 if (nodes_empty(*nodes)) 379 return -EINVAL; 380 pol->nodes = *nodes; 381 return 0; 382 } 383 384 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 385 { 386 if (nodes_empty(*nodes)) 387 return -EINVAL; 388 389 nodes_clear(pol->nodes); 390 node_set(first_node(*nodes), pol->nodes); 391 return 0; 392 } 393 394 /* 395 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 396 * any, for the new policy. mpol_new() has already validated the nodes 397 * parameter with respect to the policy mode and flags. 398 * 399 * Must be called holding task's alloc_lock to protect task's mems_allowed 400 * and mempolicy. May also be called holding the mmap_lock for write. 
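 *
 * Example (illustrative): with cpuset_current_mems_allowed = { 4, 5, 6, 7 },
 * a user nodemask of { 0, 1 } passed with MPOL_F_RELATIVE_NODES is folded
 * and mapped onto the allowed set, yielding { 4, 5 }. Without that flag the
 * nodemask is simply intersected with the allowed memory nodes, so { 0, 1 }
 * would leave the result empty and ->create() would fail with -EINVAL.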
401 */ 402 static int mpol_set_nodemask(struct mempolicy *pol, 403 const nodemask_t *nodes, struct nodemask_scratch *nsc) 404 { 405 int ret; 406 407 /* 408 * Default (pol==NULL) resp. local memory policies are not a 409 * subject of any remapping. They also do not need any special 410 * constructor. 411 */ 412 if (!pol || pol->mode == MPOL_LOCAL) 413 return 0; 414 415 /* Check N_MEMORY */ 416 nodes_and(nsc->mask1, 417 cpuset_current_mems_allowed, node_states[N_MEMORY]); 418 419 VM_BUG_ON(!nodes); 420 421 if (pol->flags & MPOL_F_RELATIVE_NODES) 422 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); 423 else 424 nodes_and(nsc->mask2, *nodes, nsc->mask1); 425 426 if (mpol_store_user_nodemask(pol)) 427 pol->w.user_nodemask = *nodes; 428 else 429 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; 430 431 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 432 return ret; 433 } 434 435 /* 436 * This function just creates a new policy, does some check and simple 437 * initialization. You must invoke mpol_set_nodemask() to set nodes. 438 */ 439 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 440 nodemask_t *nodes) 441 { 442 struct mempolicy *policy; 443 444 if (mode == MPOL_DEFAULT) { 445 if (nodes && !nodes_empty(*nodes)) 446 return ERR_PTR(-EINVAL); 447 return NULL; 448 } 449 VM_BUG_ON(!nodes); 450 451 /* 452 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 453 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 454 * All other modes require a valid pointer to a non-empty nodemask. 455 */ 456 if (mode == MPOL_PREFERRED) { 457 if (nodes_empty(*nodes)) { 458 if (((flags & MPOL_F_STATIC_NODES) || 459 (flags & MPOL_F_RELATIVE_NODES))) 460 return ERR_PTR(-EINVAL); 461 462 mode = MPOL_LOCAL; 463 } 464 } else if (mode == MPOL_LOCAL) { 465 if (!nodes_empty(*nodes) || 466 (flags & MPOL_F_STATIC_NODES) || 467 (flags & MPOL_F_RELATIVE_NODES)) 468 return ERR_PTR(-EINVAL); 469 } else if (nodes_empty(*nodes)) 470 return ERR_PTR(-EINVAL); 471 472 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 473 if (!policy) 474 return ERR_PTR(-ENOMEM); 475 atomic_set(&policy->refcnt, 1); 476 policy->mode = mode; 477 policy->flags = flags; 478 policy->home_node = NUMA_NO_NODE; 479 480 return policy; 481 } 482 483 /* Slow path of a mpol destructor. */ 484 void __mpol_put(struct mempolicy *pol) 485 { 486 if (!atomic_dec_and_test(&pol->refcnt)) 487 return; 488 kmem_cache_free(policy_cache, pol); 489 } 490 491 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 492 { 493 } 494 495 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 496 { 497 nodemask_t tmp; 498 499 if (pol->flags & MPOL_F_STATIC_NODES) 500 nodes_and(tmp, pol->w.user_nodemask, *nodes); 501 else if (pol->flags & MPOL_F_RELATIVE_NODES) 502 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 503 else { 504 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, 505 *nodes); 506 pol->w.cpuset_mems_allowed = *nodes; 507 } 508 509 if (nodes_empty(tmp)) 510 tmp = *nodes; 511 512 pol->nodes = tmp; 513 } 514 515 static void mpol_rebind_preferred(struct mempolicy *pol, 516 const nodemask_t *nodes) 517 { 518 pol->w.cpuset_mems_allowed = *nodes; 519 } 520 521 /* 522 * mpol_rebind_policy - Migrate a policy to a different set of nodes 523 * 524 * Per-vma policies are protected by mmap_lock. 
Allocations using per-task 525 * policies are protected by task->mems_allowed_seq to prevent a premature 526 * OOM/allocation failure due to parallel nodemask modification. 527 */ 528 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 529 { 530 if (!pol || pol->mode == MPOL_LOCAL) 531 return; 532 if (!mpol_store_user_nodemask(pol) && 533 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 534 return; 535 536 mpol_ops[pol->mode].rebind(pol, newmask); 537 } 538 539 /* 540 * Wrapper for mpol_rebind_policy() that just requires task 541 * pointer, and updates task mempolicy. 542 * 543 * Called with task's alloc_lock held. 544 */ 545 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 546 { 547 mpol_rebind_policy(tsk->mempolicy, new); 548 } 549 550 /* 551 * Rebind each vma in mm to new nodemask. 552 * 553 * Call holding a reference to mm. Takes mm->mmap_lock during call. 554 */ 555 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 556 { 557 struct vm_area_struct *vma; 558 VMA_ITERATOR(vmi, mm, 0); 559 560 mmap_write_lock(mm); 561 for_each_vma(vmi, vma) { 562 vma_start_write(vma); 563 mpol_rebind_policy(vma->vm_policy, new); 564 } 565 mmap_write_unlock(mm); 566 } 567 568 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 569 [MPOL_DEFAULT] = { 570 .rebind = mpol_rebind_default, 571 }, 572 [MPOL_INTERLEAVE] = { 573 .create = mpol_new_nodemask, 574 .rebind = mpol_rebind_nodemask, 575 }, 576 [MPOL_PREFERRED] = { 577 .create = mpol_new_preferred, 578 .rebind = mpol_rebind_preferred, 579 }, 580 [MPOL_BIND] = { 581 .create = mpol_new_nodemask, 582 .rebind = mpol_rebind_nodemask, 583 }, 584 [MPOL_LOCAL] = { 585 .rebind = mpol_rebind_default, 586 }, 587 [MPOL_PREFERRED_MANY] = { 588 .create = mpol_new_nodemask, 589 .rebind = mpol_rebind_preferred, 590 }, 591 [MPOL_WEIGHTED_INTERLEAVE] = { 592 .create = mpol_new_nodemask, 593 .rebind = mpol_rebind_nodemask, 594 }, 595 }; 596 597 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 598 unsigned long flags); 599 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 600 pgoff_t ilx, int *nid); 601 602 static bool strictly_unmovable(unsigned long flags) 603 { 604 /* 605 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO 606 * if any misplaced page is found. 607 */ 608 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == 609 MPOL_MF_STRICT; 610 } 611 612 struct migration_mpol { /* for alloc_migration_target_by_mpol() */ 613 struct mempolicy *pol; 614 pgoff_t ilx; 615 }; 616 617 struct queue_pages { 618 struct list_head *pagelist; 619 unsigned long flags; 620 nodemask_t *nmask; 621 unsigned long start; 622 unsigned long end; 623 struct vm_area_struct *first; 624 struct folio *large; /* note last large folio encountered */ 625 long nr_failed; /* could not be isolated at this time */ 626 }; 627 628 /* 629 * Check if the folio's nid is in qp->nmask. 630 * 631 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is 632 * in the invert of qp->nmask. 
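 *
 * Example (illustrative): do_mbind() wants to queue folios that are *not*
 * already on the requested nodes, so it walks with MPOL_MF_INVERT set.
 * With qp->nmask = { 0, 1 }, a folio resident on node 2 is then "required"
 * (a migration candidate) while a folio already on node 0 is skipped.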
633 */ 634 static inline bool queue_folio_required(struct folio *folio, 635 struct queue_pages *qp) 636 { 637 int nid = folio_nid(folio); 638 unsigned long flags = qp->flags; 639 640 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); 641 } 642 643 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) 644 { 645 struct folio *folio; 646 struct queue_pages *qp = walk->private; 647 648 if (unlikely(is_pmd_migration_entry(*pmd))) { 649 qp->nr_failed++; 650 return; 651 } 652 folio = pmd_folio(*pmd); 653 if (is_huge_zero_folio(folio)) { 654 walk->action = ACTION_CONTINUE; 655 return; 656 } 657 if (!queue_folio_required(folio, qp)) 658 return; 659 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 660 !vma_migratable(walk->vma) || 661 !migrate_folio_add(folio, qp->pagelist, qp->flags)) 662 qp->nr_failed++; 663 } 664 665 /* 666 * Scan through folios, checking if they satisfy the required conditions, 667 * moving them from LRU to local pagelist for migration if they do (or not). 668 * 669 * queue_folios_pte_range() has two possible return values: 670 * 0 - continue walking to scan for more, even if an existing folio on the 671 * wrong node could not be isolated and queued for migration. 672 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, 673 * and an existing folio was on a node that does not follow the policy. 674 */ 675 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, 676 unsigned long end, struct mm_walk *walk) 677 { 678 struct vm_area_struct *vma = walk->vma; 679 struct folio *folio; 680 struct queue_pages *qp = walk->private; 681 unsigned long flags = qp->flags; 682 pte_t *pte, *mapped_pte; 683 pte_t ptent; 684 spinlock_t *ptl; 685 int max_nr, nr; 686 687 ptl = pmd_trans_huge_lock(pmd, vma); 688 if (ptl) { 689 queue_folios_pmd(pmd, walk); 690 spin_unlock(ptl); 691 goto out; 692 } 693 694 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 695 if (!pte) { 696 walk->action = ACTION_AGAIN; 697 return 0; 698 } 699 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { 700 max_nr = (end - addr) >> PAGE_SHIFT; 701 nr = 1; 702 ptent = ptep_get(pte); 703 if (pte_none(ptent)) 704 continue; 705 if (!pte_present(ptent)) { 706 if (is_migration_entry(pte_to_swp_entry(ptent))) 707 qp->nr_failed++; 708 continue; 709 } 710 folio = vm_normal_folio(vma, addr, ptent); 711 if (!folio || folio_is_zone_device(folio)) 712 continue; 713 if (folio_test_large(folio) && max_nr != 1) 714 nr = folio_pte_batch(folio, pte, ptent, max_nr); 715 /* 716 * vm_normal_folio() filters out zero pages, but there might 717 * still be reserved folios to skip, perhaps in a VDSO. 718 */ 719 if (folio_test_reserved(folio)) 720 continue; 721 if (!queue_folio_required(folio, qp)) 722 continue; 723 if (folio_test_large(folio)) { 724 /* 725 * A large folio can only be isolated from LRU once, 726 * but may be mapped by many PTEs (and Copy-On-Write may 727 * intersperse PTEs of other, order 0, folios). This is 728 * a common case, so don't mistake it for failure (but 729 * there can be other cases of multi-mapped pages which 730 * this quick check does not help to filter out - and a 731 * search of the pagelist might grow to be prohibitive). 732 * 733 * migrate_pages(&pagelist) returns nr_failed folios, so 734 * check "large" now so that queue_pages_range() returns 735 * a comparable nr_failed folios. 
This does imply that 736 * if folio could not be isolated for some racy reason 737 * at its first PTE, later PTEs will not give it another 738 * chance of isolation; but keeps the accounting simple. 739 */ 740 if (folio == qp->large) 741 continue; 742 qp->large = folio; 743 } 744 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 745 !vma_migratable(vma) || 746 !migrate_folio_add(folio, qp->pagelist, flags)) { 747 qp->nr_failed += nr; 748 if (strictly_unmovable(flags)) 749 break; 750 } 751 } 752 pte_unmap_unlock(mapped_pte, ptl); 753 cond_resched(); 754 out: 755 if (qp->nr_failed && strictly_unmovable(flags)) 756 return -EIO; 757 return 0; 758 } 759 760 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, 761 unsigned long addr, unsigned long end, 762 struct mm_walk *walk) 763 { 764 #ifdef CONFIG_HUGETLB_PAGE 765 struct queue_pages *qp = walk->private; 766 unsigned long flags = qp->flags; 767 struct folio *folio; 768 spinlock_t *ptl; 769 pte_t entry; 770 771 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 772 entry = huge_ptep_get(walk->mm, addr, pte); 773 if (!pte_present(entry)) { 774 if (unlikely(is_hugetlb_entry_migration(entry))) 775 qp->nr_failed++; 776 goto unlock; 777 } 778 folio = pfn_folio(pte_pfn(entry)); 779 if (!queue_folio_required(folio, qp)) 780 goto unlock; 781 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || 782 !vma_migratable(walk->vma)) { 783 qp->nr_failed++; 784 goto unlock; 785 } 786 /* 787 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 788 * Choosing not to migrate a shared folio is not counted as a failure. 789 * 790 * See folio_maybe_mapped_shared() on possible imprecision when we 791 * cannot easily detect if a folio is shared. 792 */ 793 if ((flags & MPOL_MF_MOVE_ALL) || 794 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) 795 if (!folio_isolate_hugetlb(folio, qp->pagelist)) 796 qp->nr_failed++; 797 unlock: 798 spin_unlock(ptl); 799 if (qp->nr_failed && strictly_unmovable(flags)) 800 return -EIO; 801 #endif 802 return 0; 803 } 804 805 #ifdef CONFIG_NUMA_BALANCING 806 /* 807 * This is used to mark a range of virtual addresses to be inaccessible. 808 * These are later cleared by a NUMA hinting fault. Depending on these 809 * faults, pages may be migrated for better NUMA placement. 810 * 811 * This is assuming that NUMA faults are handled using PROT_NONE. If 812 * an architecture makes a different choice, it will need further 813 * changes to the core. 
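 *
 * Example (illustrative): with automatic NUMA balancing enabled (the
 * numa_balancing sysctl), task_numa_work() periodically applies this to
 * ranges of the task's address space; the next touch of such a range takes
 * a hinting fault, and pages that keep being touched from a remote node
 * become candidates for migration closer to the task.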
814 */ 815 unsigned long change_prot_numa(struct vm_area_struct *vma, 816 unsigned long addr, unsigned long end) 817 { 818 struct mmu_gather tlb; 819 long nr_updated; 820 821 tlb_gather_mmu(&tlb, vma->vm_mm); 822 823 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); 824 if (nr_updated > 0) { 825 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 826 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); 827 } 828 829 tlb_finish_mmu(&tlb); 830 831 return nr_updated; 832 } 833 #endif /* CONFIG_NUMA_BALANCING */ 834 835 static int queue_pages_test_walk(unsigned long start, unsigned long end, 836 struct mm_walk *walk) 837 { 838 struct vm_area_struct *next, *vma = walk->vma; 839 struct queue_pages *qp = walk->private; 840 unsigned long flags = qp->flags; 841 842 /* range check first */ 843 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); 844 845 if (!qp->first) { 846 qp->first = vma; 847 if (!(flags & MPOL_MF_DISCONTIG_OK) && 848 (qp->start < vma->vm_start)) 849 /* hole at head side of range */ 850 return -EFAULT; 851 } 852 next = find_vma(vma->vm_mm, vma->vm_end); 853 if (!(flags & MPOL_MF_DISCONTIG_OK) && 854 ((vma->vm_end < qp->end) && 855 (!next || vma->vm_end < next->vm_start))) 856 /* hole at middle or tail of range */ 857 return -EFAULT; 858 859 /* 860 * Need check MPOL_MF_STRICT to return -EIO if possible 861 * regardless of vma_migratable 862 */ 863 if (!vma_migratable(vma) && 864 !(flags & MPOL_MF_STRICT)) 865 return 1; 866 867 /* 868 * Check page nodes, and queue pages to move, in the current vma. 869 * But if no moving, and no strict checking, the scan can be skipped. 870 */ 871 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 872 return 0; 873 return 1; 874 } 875 876 static const struct mm_walk_ops queue_pages_walk_ops = { 877 .hugetlb_entry = queue_folios_hugetlb, 878 .pmd_entry = queue_folios_pte_range, 879 .test_walk = queue_pages_test_walk, 880 .walk_lock = PGWALK_RDLOCK, 881 }; 882 883 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 884 .hugetlb_entry = queue_folios_hugetlb, 885 .pmd_entry = queue_folios_pte_range, 886 .test_walk = queue_pages_test_walk, 887 .walk_lock = PGWALK_WRLOCK, 888 }; 889 890 /* 891 * Walk through page tables and collect pages to be migrated. 892 * 893 * If pages found in a given range are not on the required set of @nodes, 894 * and migration is allowed, they are isolated and queued to @pagelist. 895 * 896 * queue_pages_range() may return: 897 * 0 - all pages already on the right node, or successfully queued for moving 898 * (or neither strict checking nor moving requested: only range checking). 899 * >0 - this number of misplaced folios could not be queued for moving 900 * (a hugetlbfs page or a transparent huge page being counted as 1). 901 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. 902 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. 903 */ 904 static long 905 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 906 nodemask_t *nodes, unsigned long flags, 907 struct list_head *pagelist) 908 { 909 int err; 910 struct queue_pages qp = { 911 .pagelist = pagelist, 912 .flags = flags, 913 .nmask = nodes, 914 .start = start, 915 .end = end, 916 .first = NULL, 917 }; 918 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? 
919 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 920 921 err = walk_page_range(mm, start, end, ops, &qp); 922 923 if (!qp.first) 924 /* whole range in hole */ 925 err = -EFAULT; 926 927 return err ? : qp.nr_failed; 928 } 929 930 /* 931 * Apply policy to a single VMA 932 * This must be called with the mmap_lock held for writing. 933 */ 934 static int vma_replace_policy(struct vm_area_struct *vma, 935 struct mempolicy *pol) 936 { 937 int err; 938 struct mempolicy *old; 939 struct mempolicy *new; 940 941 vma_assert_write_locked(vma); 942 943 new = mpol_dup(pol); 944 if (IS_ERR(new)) 945 return PTR_ERR(new); 946 947 if (vma->vm_ops && vma->vm_ops->set_policy) { 948 err = vma->vm_ops->set_policy(vma, new); 949 if (err) 950 goto err_out; 951 } 952 953 old = vma->vm_policy; 954 vma->vm_policy = new; /* protected by mmap_lock */ 955 mpol_put(old); 956 957 return 0; 958 err_out: 959 mpol_put(new); 960 return err; 961 } 962 963 /* Split or merge the VMA (if required) and apply the new policy */ 964 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, 965 struct vm_area_struct **prev, unsigned long start, 966 unsigned long end, struct mempolicy *new_pol) 967 { 968 unsigned long vmstart, vmend; 969 970 vmend = min(end, vma->vm_end); 971 if (start > vma->vm_start) { 972 *prev = vma; 973 vmstart = start; 974 } else { 975 vmstart = vma->vm_start; 976 } 977 978 if (mpol_equal(vma->vm_policy, new_pol)) { 979 *prev = vma; 980 return 0; 981 } 982 983 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); 984 if (IS_ERR(vma)) 985 return PTR_ERR(vma); 986 987 *prev = vma; 988 return vma_replace_policy(vma, new_pol); 989 } 990 991 /* Set the process memory policy */ 992 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 993 nodemask_t *nodes) 994 { 995 struct mempolicy *new, *old; 996 NODEMASK_SCRATCH(scratch); 997 int ret; 998 999 if (!scratch) 1000 return -ENOMEM; 1001 1002 new = mpol_new(mode, flags, nodes); 1003 if (IS_ERR(new)) { 1004 ret = PTR_ERR(new); 1005 goto out; 1006 } 1007 1008 task_lock(current); 1009 ret = mpol_set_nodemask(new, nodes, scratch); 1010 if (ret) { 1011 task_unlock(current); 1012 mpol_put(new); 1013 goto out; 1014 } 1015 1016 old = current->mempolicy; 1017 current->mempolicy = new; 1018 if (new && (new->mode == MPOL_INTERLEAVE || 1019 new->mode == MPOL_WEIGHTED_INTERLEAVE)) { 1020 current->il_prev = MAX_NUMNODES-1; 1021 current->il_weight = 0; 1022 } 1023 task_unlock(current); 1024 mpol_put(old); 1025 ret = 0; 1026 out: 1027 NODEMASK_SCRATCH_FREE(scratch); 1028 return ret; 1029 } 1030 1031 /* 1032 * Return nodemask for policy for get_mempolicy() query 1033 * 1034 * Called with task's alloc_lock held 1035 */ 1036 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) 1037 { 1038 nodes_clear(*nodes); 1039 if (pol == &default_policy) 1040 return; 1041 1042 switch (pol->mode) { 1043 case MPOL_BIND: 1044 case MPOL_INTERLEAVE: 1045 case MPOL_PREFERRED: 1046 case MPOL_PREFERRED_MANY: 1047 case MPOL_WEIGHTED_INTERLEAVE: 1048 *nodes = pol->nodes; 1049 break; 1050 case MPOL_LOCAL: 1051 /* return empty node mask for local allocation */ 1052 break; 1053 default: 1054 BUG(); 1055 } 1056 } 1057 1058 static int lookup_node(struct mm_struct *mm, unsigned long addr) 1059 { 1060 struct page *p = NULL; 1061 int ret; 1062 1063 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); 1064 if (ret > 0) { 1065 ret = page_to_nid(p); 1066 put_page(p); 1067 } 1068 return ret; 1069 } 1070 1071 /* Retrieve NUMA policy */ 1072 
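/*
 * Illustrative userspace sketch (not part of this file): the lookups below
 * are reached through get_mempolicy(2). This assumes the libnuma <numaif.h>
 * wrappers; error handling is trimmed.
 *
 *      #include <numaif.h>
 *      #include <stdio.h>
 *
 *      int mode, probe = 0;
 *      unsigned long nodes = 0;
 *
 *      // Current task policy and its nodemask.
 *      get_mempolicy(&mode, &nodes, 8 * sizeof(nodes), NULL, 0);
 *
 *      // Which node backs the page holding 'probe'?
 *      get_mempolicy(&mode, NULL, 0, &probe, MPOL_F_NODE | MPOL_F_ADDR);
 *      printf("page is on node %d\n", mode);
 */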
static long do_get_mempolicy(int *policy, nodemask_t *nmask, 1073 unsigned long addr, unsigned long flags) 1074 { 1075 int err; 1076 struct mm_struct *mm = current->mm; 1077 struct vm_area_struct *vma = NULL; 1078 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; 1079 1080 if (flags & 1081 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 1082 return -EINVAL; 1083 1084 if (flags & MPOL_F_MEMS_ALLOWED) { 1085 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 1086 return -EINVAL; 1087 *policy = 0; /* just so it's initialized */ 1088 task_lock(current); 1089 *nmask = cpuset_current_mems_allowed; 1090 task_unlock(current); 1091 return 0; 1092 } 1093 1094 if (flags & MPOL_F_ADDR) { 1095 pgoff_t ilx; /* ignored here */ 1096 /* 1097 * Do NOT fall back to task policy if the 1098 * vma/shared policy at addr is NULL. We 1099 * want to return MPOL_DEFAULT in this case. 1100 */ 1101 mmap_read_lock(mm); 1102 vma = vma_lookup(mm, addr); 1103 if (!vma) { 1104 mmap_read_unlock(mm); 1105 return -EFAULT; 1106 } 1107 pol = __get_vma_policy(vma, addr, &ilx); 1108 } else if (addr) 1109 return -EINVAL; 1110 1111 if (!pol) 1112 pol = &default_policy; /* indicates default behavior */ 1113 1114 if (flags & MPOL_F_NODE) { 1115 if (flags & MPOL_F_ADDR) { 1116 /* 1117 * Take a refcount on the mpol, because we are about to 1118 * drop the mmap_lock, after which only "pol" remains 1119 * valid, "vma" is stale. 1120 */ 1121 pol_refcount = pol; 1122 vma = NULL; 1123 mpol_get(pol); 1124 mmap_read_unlock(mm); 1125 err = lookup_node(mm, addr); 1126 if (err < 0) 1127 goto out; 1128 *policy = err; 1129 } else if (pol == current->mempolicy && 1130 pol->mode == MPOL_INTERLEAVE) { 1131 *policy = next_node_in(current->il_prev, pol->nodes); 1132 } else if (pol == current->mempolicy && 1133 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1134 if (current->il_weight) 1135 *policy = current->il_prev; 1136 else 1137 *policy = next_node_in(current->il_prev, 1138 pol->nodes); 1139 } else { 1140 err = -EINVAL; 1141 goto out; 1142 } 1143 } else { 1144 *policy = pol == &default_policy ? MPOL_DEFAULT : 1145 pol->mode; 1146 /* 1147 * Internal mempolicy flags must be masked off before exposing 1148 * the policy to userspace. 1149 */ 1150 *policy |= (pol->flags & MPOL_MODE_FLAGS); 1151 } 1152 1153 err = 0; 1154 if (nmask) { 1155 if (mpol_store_user_nodemask(pol)) { 1156 *nmask = pol->w.user_nodemask; 1157 } else { 1158 task_lock(current); 1159 get_policy_nodemask(pol, nmask); 1160 task_unlock(current); 1161 } 1162 } 1163 1164 out: 1165 mpol_cond_put(pol); 1166 if (vma) 1167 mmap_read_unlock(mm); 1168 if (pol_refcount) 1169 mpol_put(pol_refcount); 1170 return err; 1171 } 1172 1173 #ifdef CONFIG_MIGRATION 1174 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1175 unsigned long flags) 1176 { 1177 /* 1178 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. 1179 * Choosing not to migrate a shared folio is not counted as a failure. 1180 * 1181 * See folio_maybe_mapped_shared() on possible imprecision when we 1182 * cannot easily detect if a folio is shared. 1183 */ 1184 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { 1185 if (folio_isolate_lru(folio)) { 1186 list_add_tail(&folio->lru, foliolist); 1187 node_stat_mod_folio(folio, 1188 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1189 folio_nr_pages(folio)); 1190 } else { 1191 /* 1192 * Non-movable folio may reach here. And, there may be 1193 * temporary off LRU folios or non-LRU movable folios. 
1194 * Treat them as unmovable folios since they can't be 1195 * isolated, so they can't be moved at the moment. 1196 */ 1197 return false; 1198 } 1199 } 1200 return true; 1201 } 1202 1203 /* 1204 * Migrate pages from one node to a target node. 1205 * Returns error or the number of pages not migrated. 1206 */ 1207 static long migrate_to_node(struct mm_struct *mm, int source, int dest, 1208 int flags) 1209 { 1210 nodemask_t nmask; 1211 struct vm_area_struct *vma; 1212 LIST_HEAD(pagelist); 1213 long nr_failed; 1214 long err = 0; 1215 struct migration_target_control mtc = { 1216 .nid = dest, 1217 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 1218 .reason = MR_SYSCALL, 1219 }; 1220 1221 nodes_clear(nmask); 1222 node_set(source, nmask); 1223 1224 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1225 1226 mmap_read_lock(mm); 1227 vma = find_vma(mm, 0); 1228 if (unlikely(!vma)) { 1229 mmap_read_unlock(mm); 1230 return 0; 1231 } 1232 1233 /* 1234 * This does not migrate the range, but isolates all pages that 1235 * need migration. Between passing in the full user address 1236 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, 1237 * but passes back the count of pages which could not be isolated. 1238 */ 1239 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, 1240 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1241 mmap_read_unlock(mm); 1242 1243 if (!list_empty(&pagelist)) { 1244 err = migrate_pages(&pagelist, alloc_migration_target, NULL, 1245 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); 1246 if (err) 1247 putback_movable_pages(&pagelist); 1248 } 1249 1250 if (err >= 0) 1251 err += nr_failed; 1252 return err; 1253 } 1254 1255 /* 1256 * Move pages between the two nodesets so as to preserve the physical 1257 * layout as much as possible. 1258 * 1259 * Returns the number of page that could not be moved. 1260 */ 1261 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1262 const nodemask_t *to, int flags) 1263 { 1264 long nr_failed = 0; 1265 long err = 0; 1266 nodemask_t tmp; 1267 1268 lru_cache_disable(); 1269 1270 /* 1271 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1272 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1273 * bit in 'tmp', and return that <source, dest> pair for migration. 1274 * The pair of nodemasks 'to' and 'from' define the map. 1275 * 1276 * If no pair of bits is found that way, fallback to picking some 1277 * pair of 'source' and 'dest' bits that are not the same. If the 1278 * 'source' and 'dest' bits are the same, this represents a node 1279 * that will be migrating to itself, so no pages need move. 1280 * 1281 * If no bits are left in 'tmp', or if all remaining bits left 1282 * in 'tmp' correspond to the same bit in 'to', return false 1283 * (nothing left to migrate). 1284 * 1285 * This lets us pick a pair of nodes to migrate between, such that 1286 * if possible the dest node is not already occupied by some other 1287 * source node, minimizing the risk of overloading the memory on a 1288 * node that would happen if we migrated incoming memory to a node 1289 * before migrating outgoing memory source that same node. 1290 * 1291 * A single scan of tmp is sufficient. As we go, we remember the 1292 * most recent <s, d> pair that moved (s != d). If we find a pair 1293 * that not only moved, but what's better, moved to an empty slot 1294 * (d is not set in tmp), then we break out then, with that pair. 
1295 * Otherwise when we finish scanning from_tmp, we at least have the 1296 * most recent <s, d> pair that moved. If we get all the way through 1297 * the scan of tmp without finding any node that moved, much less 1298 * moved to an empty node, then there is nothing left worth migrating. 1299 */ 1300 1301 tmp = *from; 1302 while (!nodes_empty(tmp)) { 1303 int s, d; 1304 int source = NUMA_NO_NODE; 1305 int dest = 0; 1306 1307 for_each_node_mask(s, tmp) { 1308 1309 /* 1310 * do_migrate_pages() tries to maintain the relative 1311 * node relationship of the pages established between 1312 * threads and memory areas. 1313 * 1314 * However if the number of source nodes is not equal to 1315 * the number of destination nodes we can not preserve 1316 * this node relative relationship. In that case, skip 1317 * copying memory from a node that is in the destination 1318 * mask. 1319 * 1320 * Example: [2,3,4] -> [3,4,5] moves everything. 1321 * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 1322 */ 1323 1324 if ((nodes_weight(*from) != nodes_weight(*to)) && 1325 (node_isset(s, *to))) 1326 continue; 1327 1328 d = node_remap(s, *from, *to); 1329 if (s == d) 1330 continue; 1331 1332 source = s; /* Node moved. Memorize */ 1333 dest = d; 1334 1335 /* dest not in remaining from nodes? */ 1336 if (!node_isset(dest, tmp)) 1337 break; 1338 } 1339 if (source == NUMA_NO_NODE) 1340 break; 1341 1342 node_clear(source, tmp); 1343 err = migrate_to_node(mm, source, dest, flags); 1344 if (err > 0) 1345 nr_failed += err; 1346 if (err < 0) 1347 break; 1348 } 1349 1350 lru_cache_enable(); 1351 if (err < 0) 1352 return err; 1353 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; 1354 } 1355 1356 /* 1357 * Allocate a new folio for page migration, according to NUMA mempolicy. 1358 */ 1359 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1360 unsigned long private) 1361 { 1362 struct migration_mpol *mmpol = (struct migration_mpol *)private; 1363 struct mempolicy *pol = mmpol->pol; 1364 pgoff_t ilx = mmpol->ilx; 1365 unsigned int order; 1366 int nid = numa_node_id(); 1367 gfp_t gfp; 1368 1369 order = folio_order(src); 1370 ilx += src->index >> order; 1371 1372 if (folio_test_hugetlb(src)) { 1373 nodemask_t *nodemask; 1374 struct hstate *h; 1375 1376 h = folio_hstate(src); 1377 gfp = htlb_alloc_mask(h); 1378 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 1379 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, 1380 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); 1381 } 1382 1383 if (folio_test_large(src)) 1384 gfp = GFP_TRANSHUGE; 1385 else 1386 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; 1387 1388 return folio_alloc_mpol(gfp, order, pol, ilx, nid); 1389 } 1390 #else 1391 1392 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, 1393 unsigned long flags) 1394 { 1395 return false; 1396 } 1397 1398 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1399 const nodemask_t *to, int flags) 1400 { 1401 return -ENOSYS; 1402 } 1403 1404 static struct folio *alloc_migration_target_by_mpol(struct folio *src, 1405 unsigned long private) 1406 { 1407 return NULL; 1408 } 1409 #endif 1410 1411 static long do_mbind(unsigned long start, unsigned long len, 1412 unsigned short mode, unsigned short mode_flags, 1413 nodemask_t *nmask, unsigned long flags) 1414 { 1415 struct mm_struct *mm = current->mm; 1416 struct vm_area_struct *vma, *prev; 1417 struct vma_iterator vmi; 1418 struct migration_mpol mmpol; 1419 struct mempolicy *new; 1420 unsigned long end; 1421 
long err; 1422 long nr_failed; 1423 LIST_HEAD(pagelist); 1424 1425 if (flags & ~(unsigned long)MPOL_MF_VALID) 1426 return -EINVAL; 1427 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1428 return -EPERM; 1429 1430 if (start & ~PAGE_MASK) 1431 return -EINVAL; 1432 1433 if (mode == MPOL_DEFAULT) 1434 flags &= ~MPOL_MF_STRICT; 1435 1436 len = PAGE_ALIGN(len); 1437 end = start + len; 1438 1439 if (end < start) 1440 return -EINVAL; 1441 if (end == start) 1442 return 0; 1443 1444 new = mpol_new(mode, mode_flags, nmask); 1445 if (IS_ERR(new)) 1446 return PTR_ERR(new); 1447 1448 /* 1449 * If we are using the default policy then operation 1450 * on discontinuous address spaces is okay after all 1451 */ 1452 if (!new) 1453 flags |= MPOL_MF_DISCONTIG_OK; 1454 1455 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1456 lru_cache_disable(); 1457 { 1458 NODEMASK_SCRATCH(scratch); 1459 if (scratch) { 1460 mmap_write_lock(mm); 1461 err = mpol_set_nodemask(new, nmask, scratch); 1462 if (err) 1463 mmap_write_unlock(mm); 1464 } else 1465 err = -ENOMEM; 1466 NODEMASK_SCRATCH_FREE(scratch); 1467 } 1468 if (err) 1469 goto mpol_out; 1470 1471 /* 1472 * Lock the VMAs before scanning for pages to migrate, 1473 * to ensure we don't miss a concurrently inserted page. 1474 */ 1475 nr_failed = queue_pages_range(mm, start, end, nmask, 1476 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); 1477 1478 if (nr_failed < 0) { 1479 err = nr_failed; 1480 nr_failed = 0; 1481 } else { 1482 vma_iter_init(&vmi, mm, start); 1483 prev = vma_prev(&vmi); 1484 for_each_vma_range(vmi, vma, end) { 1485 err = mbind_range(&vmi, vma, &prev, start, end, new); 1486 if (err) 1487 break; 1488 } 1489 } 1490 1491 if (!err && !list_empty(&pagelist)) { 1492 /* Convert MPOL_DEFAULT's NULL to task or default policy */ 1493 if (!new) { 1494 new = get_task_policy(current); 1495 mpol_get(new); 1496 } 1497 mmpol.pol = new; 1498 mmpol.ilx = 0; 1499 1500 /* 1501 * In the interleaved case, attempt to allocate on exactly the 1502 * targeted nodes, for the first VMA to be migrated; for later 1503 * VMAs, the nodes will still be interleaved from the targeted 1504 * nodemask, but one by one may be selected differently. 
1505 */ 1506 if (new->mode == MPOL_INTERLEAVE || 1507 new->mode == MPOL_WEIGHTED_INTERLEAVE) { 1508 struct folio *folio; 1509 unsigned int order; 1510 unsigned long addr = -EFAULT; 1511 1512 list_for_each_entry(folio, &pagelist, lru) { 1513 if (!folio_test_ksm(folio)) 1514 break; 1515 } 1516 if (!list_entry_is_head(folio, &pagelist, lru)) { 1517 vma_iter_init(&vmi, mm, start); 1518 for_each_vma_range(vmi, vma, end) { 1519 addr = page_address_in_vma(folio, 1520 folio_page(folio, 0), vma); 1521 if (addr != -EFAULT) 1522 break; 1523 } 1524 } 1525 if (addr != -EFAULT) { 1526 order = folio_order(folio); 1527 /* We already know the pol, but not the ilx */ 1528 mpol_cond_put(get_vma_policy(vma, addr, order, 1529 &mmpol.ilx)); 1530 /* Set base from which to increment by index */ 1531 mmpol.ilx -= folio->index >> order; 1532 } 1533 } 1534 } 1535 1536 mmap_write_unlock(mm); 1537 1538 if (!err && !list_empty(&pagelist)) { 1539 nr_failed |= migrate_pages(&pagelist, 1540 alloc_migration_target_by_mpol, NULL, 1541 (unsigned long)&mmpol, MIGRATE_SYNC, 1542 MR_MEMPOLICY_MBIND, NULL); 1543 } 1544 1545 if (nr_failed && (flags & MPOL_MF_STRICT)) 1546 err = -EIO; 1547 if (!list_empty(&pagelist)) 1548 putback_movable_pages(&pagelist); 1549 mpol_out: 1550 mpol_put(new); 1551 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 1552 lru_cache_enable(); 1553 return err; 1554 } 1555 1556 /* 1557 * User space interface with variable sized bitmaps for nodelists. 1558 */ 1559 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, 1560 unsigned long maxnode) 1561 { 1562 unsigned long nlongs = BITS_TO_LONGS(maxnode); 1563 int ret; 1564 1565 if (in_compat_syscall()) 1566 ret = compat_get_bitmap(mask, 1567 (const compat_ulong_t __user *)nmask, 1568 maxnode); 1569 else 1570 ret = copy_from_user(mask, nmask, 1571 nlongs * sizeof(unsigned long)); 1572 1573 if (ret) 1574 return -EFAULT; 1575 1576 if (maxnode % BITS_PER_LONG) 1577 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; 1578 1579 return 0; 1580 } 1581 1582 /* Copy a node mask from user space. */ 1583 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1584 unsigned long maxnode) 1585 { 1586 --maxnode; 1587 nodes_clear(*nodes); 1588 if (maxnode == 0 || !nmask) 1589 return 0; 1590 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1591 return -EINVAL; 1592 1593 /* 1594 * When the user specified more nodes than supported just check 1595 * if the non supported part is all zero, one word at a time, 1596 * starting at the end. 
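 *
 * Example (illustrative): a caller built against large userspace nodemasks
 * may pass maxnode == 1024 to a kernel with a much smaller MAX_NUMNODES;
 * the call still succeeds as long as every bit above what this kernel
 * supports is clear, and fails with -EINVAL otherwise.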
1597 */ 1598 while (maxnode > MAX_NUMNODES) { 1599 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); 1600 unsigned long t; 1601 1602 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) 1603 return -EFAULT; 1604 1605 if (maxnode - bits >= MAX_NUMNODES) { 1606 maxnode -= bits; 1607 } else { 1608 maxnode = MAX_NUMNODES; 1609 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); 1610 } 1611 if (t) 1612 return -EINVAL; 1613 } 1614 1615 return get_bitmap(nodes_addr(*nodes), nmask, maxnode); 1616 } 1617 1618 /* Copy a kernel node mask to user space */ 1619 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1620 nodemask_t *nodes) 1621 { 1622 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1623 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); 1624 bool compat = in_compat_syscall(); 1625 1626 if (compat) 1627 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); 1628 1629 if (copy > nbytes) { 1630 if (copy > PAGE_SIZE) 1631 return -EINVAL; 1632 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1633 return -EFAULT; 1634 copy = nbytes; 1635 maxnode = nr_node_ids; 1636 } 1637 1638 if (compat) 1639 return compat_put_bitmap((compat_ulong_t __user *)mask, 1640 nodes_addr(*nodes), maxnode); 1641 1642 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1643 } 1644 1645 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ 1646 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) 1647 { 1648 *flags = *mode & MPOL_MODE_FLAGS; 1649 *mode &= ~MPOL_MODE_FLAGS; 1650 1651 if ((unsigned int)(*mode) >= MPOL_MAX) 1652 return -EINVAL; 1653 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) 1654 return -EINVAL; 1655 if (*flags & MPOL_F_NUMA_BALANCING) { 1656 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) 1657 *flags |= (MPOL_F_MOF | MPOL_F_MORON); 1658 else 1659 return -EINVAL; 1660 } 1661 return 0; 1662 } 1663 1664 static long kernel_mbind(unsigned long start, unsigned long len, 1665 unsigned long mode, const unsigned long __user *nmask, 1666 unsigned long maxnode, unsigned int flags) 1667 { 1668 unsigned short mode_flags; 1669 nodemask_t nodes; 1670 int lmode = mode; 1671 int err; 1672 1673 start = untagged_addr(start); 1674 err = sanitize_mpol_flags(&lmode, &mode_flags); 1675 if (err) 1676 return err; 1677 1678 err = get_nodes(&nodes, nmask, maxnode); 1679 if (err) 1680 return err; 1681 1682 return do_mbind(start, len, lmode, mode_flags, &nodes, flags); 1683 } 1684 1685 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, 1686 unsigned long, home_node, unsigned long, flags) 1687 { 1688 struct mm_struct *mm = current->mm; 1689 struct vm_area_struct *vma, *prev; 1690 struct mempolicy *new, *old; 1691 unsigned long end; 1692 int err = -ENOENT; 1693 VMA_ITERATOR(vmi, mm, start); 1694 1695 start = untagged_addr(start); 1696 if (start & ~PAGE_MASK) 1697 return -EINVAL; 1698 /* 1699 * flags is used for future extension if any. 1700 */ 1701 if (flags != 0) 1702 return -EINVAL; 1703 1704 /* 1705 * Check home_node is online to avoid accessing uninitialized 1706 * NODE_DATA. 
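 *
 * Example (illustrative, userspace): this syscall is typically invoked
 * directly, e.g. syscall(__NR_set_mempolicy_home_node, start, len, node, 0),
 * on a range that already carries an MPOL_BIND or MPOL_PREFERRED_MANY
 * policy set up with mbind(2).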
1707 */ 1708 if (home_node >= MAX_NUMNODES || !node_online(home_node)) 1709 return -EINVAL; 1710 1711 len = PAGE_ALIGN(len); 1712 end = start + len; 1713 1714 if (end < start) 1715 return -EINVAL; 1716 if (end == start) 1717 return 0; 1718 mmap_write_lock(mm); 1719 prev = vma_prev(&vmi); 1720 for_each_vma_range(vmi, vma, end) { 1721 /* 1722 * If any vma in the range got policy other than MPOL_BIND 1723 * or MPOL_PREFERRED_MANY we return error. We don't reset 1724 * the home node for vmas we already updated before. 1725 */ 1726 old = vma_policy(vma); 1727 if (!old) { 1728 prev = vma; 1729 continue; 1730 } 1731 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { 1732 err = -EOPNOTSUPP; 1733 break; 1734 } 1735 new = mpol_dup(old); 1736 if (IS_ERR(new)) { 1737 err = PTR_ERR(new); 1738 break; 1739 } 1740 1741 vma_start_write(vma); 1742 new->home_node = home_node; 1743 err = mbind_range(&vmi, vma, &prev, start, end, new); 1744 mpol_put(new); 1745 if (err) 1746 break; 1747 } 1748 mmap_write_unlock(mm); 1749 return err; 1750 } 1751 1752 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1753 unsigned long, mode, const unsigned long __user *, nmask, 1754 unsigned long, maxnode, unsigned int, flags) 1755 { 1756 return kernel_mbind(start, len, mode, nmask, maxnode, flags); 1757 } 1758 1759 /* Set the process memory policy */ 1760 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, 1761 unsigned long maxnode) 1762 { 1763 unsigned short mode_flags; 1764 nodemask_t nodes; 1765 int lmode = mode; 1766 int err; 1767 1768 err = sanitize_mpol_flags(&lmode, &mode_flags); 1769 if (err) 1770 return err; 1771 1772 err = get_nodes(&nodes, nmask, maxnode); 1773 if (err) 1774 return err; 1775 1776 return do_set_mempolicy(lmode, mode_flags, &nodes); 1777 } 1778 1779 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, 1780 unsigned long, maxnode) 1781 { 1782 return kernel_set_mempolicy(mode, nmask, maxnode); 1783 } 1784 1785 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, 1786 const unsigned long __user *old_nodes, 1787 const unsigned long __user *new_nodes) 1788 { 1789 struct mm_struct *mm = NULL; 1790 struct task_struct *task; 1791 nodemask_t task_nodes; 1792 int err; 1793 nodemask_t *old; 1794 nodemask_t *new; 1795 NODEMASK_SCRATCH(scratch); 1796 1797 if (!scratch) 1798 return -ENOMEM; 1799 1800 old = &scratch->mask1; 1801 new = &scratch->mask2; 1802 1803 err = get_nodes(old, old_nodes, maxnode); 1804 if (err) 1805 goto out; 1806 1807 err = get_nodes(new, new_nodes, maxnode); 1808 if (err) 1809 goto out; 1810 1811 /* Find the mm_struct */ 1812 rcu_read_lock(); 1813 task = pid ? find_task_by_vpid(pid) : current; 1814 if (!task) { 1815 rcu_read_unlock(); 1816 err = -ESRCH; 1817 goto out; 1818 } 1819 get_task_struct(task); 1820 1821 err = -EINVAL; 1822 1823 /* 1824 * Check if this process has the right to modify the specified process. 1825 * Use the regular "ptrace_may_access()" checks. 1826 */ 1827 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1828 rcu_read_unlock(); 1829 err = -EPERM; 1830 goto out_put; 1831 } 1832 rcu_read_unlock(); 1833 1834 task_nodes = cpuset_mems_allowed(task); 1835 /* Is the user allowed to access the target nodes? 
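 *
 * Example (illustrative, userspace): migrate_pages(getpid(), 8 * sizeof(long),
 * &from, &to) from libnuma's <numaif.h> asks to move the caller's own pages
 * off the nodes in 'from' and onto the corresponding nodes in 'to'. Moving
 * another task's pages additionally requires ptrace-level access, and
 * targeting nodes outside that task's cpuset requires CAP_SYS_NICE.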
*/ 1836 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1837 err = -EPERM; 1838 goto out_put; 1839 } 1840 1841 task_nodes = cpuset_mems_allowed(current); 1842 nodes_and(*new, *new, task_nodes); 1843 if (nodes_empty(*new)) 1844 goto out_put; 1845 1846 err = security_task_movememory(task); 1847 if (err) 1848 goto out_put; 1849 1850 mm = get_task_mm(task); 1851 put_task_struct(task); 1852 1853 if (!mm) { 1854 err = -EINVAL; 1855 goto out; 1856 } 1857 1858 err = do_migrate_pages(mm, old, new, 1859 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1860 1861 mmput(mm); 1862 out: 1863 NODEMASK_SCRATCH_FREE(scratch); 1864 1865 return err; 1866 1867 out_put: 1868 put_task_struct(task); 1869 goto out; 1870 } 1871 1872 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1873 const unsigned long __user *, old_nodes, 1874 const unsigned long __user *, new_nodes) 1875 { 1876 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); 1877 } 1878 1879 /* Retrieve NUMA policy */ 1880 static int kernel_get_mempolicy(int __user *policy, 1881 unsigned long __user *nmask, 1882 unsigned long maxnode, 1883 unsigned long addr, 1884 unsigned long flags) 1885 { 1886 int err; 1887 int pval; 1888 nodemask_t nodes; 1889 1890 if (nmask != NULL && maxnode < nr_node_ids) 1891 return -EINVAL; 1892 1893 addr = untagged_addr(addr); 1894 1895 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1896 1897 if (err) 1898 return err; 1899 1900 if (policy && put_user(pval, policy)) 1901 return -EFAULT; 1902 1903 if (nmask) 1904 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1905 1906 return err; 1907 } 1908 1909 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1910 unsigned long __user *, nmask, unsigned long, maxnode, 1911 unsigned long, addr, unsigned long, flags) 1912 { 1913 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); 1914 } 1915 1916 bool vma_migratable(struct vm_area_struct *vma) 1917 { 1918 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1919 return false; 1920 1921 /* 1922 * DAX device mappings require predictable access latency, so avoid 1923 * incurring periodic faults. 1924 */ 1925 if (vma_is_dax(vma)) 1926 return false; 1927 1928 if (is_vm_hugetlb_page(vma) && 1929 !hugepage_migration_supported(hstate_vma(vma))) 1930 return false; 1931 1932 /* 1933 * Migration allocates pages in the highest zone. If we cannot 1934 * do so then migration (at least from node to node) is not 1935 * possible. 1936 */ 1937 if (vma->vm_file && 1938 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) 1939 < policy_zone) 1940 return false; 1941 return true; 1942 } 1943 1944 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 1945 unsigned long addr, pgoff_t *ilx) 1946 { 1947 *ilx = 0; 1948 return (vma->vm_ops && vma->vm_ops->get_policy) ? 1949 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; 1950 } 1951 1952 /* 1953 * get_vma_policy(@vma, @addr, @order, @ilx) 1954 * @vma: virtual memory area whose policy is sought 1955 * @addr: address in @vma for shared policy lookup 1956 * @order: 0, or appropriate huge_page_order for interleaving 1957 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or 1958 * MPOL_WEIGHTED_INTERLEAVE 1959 * 1960 * Returns effective policy for a VMA at specified address. 1961 * Falls back to current->mempolicy or system default policy, as necessary. 
1962 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1963 * count--added by the get_policy() vm_op, as appropriate--to protect against 1964 * freeing by another task. It is the caller's responsibility to free the 1965 * extra reference for shared policies. 1966 */ 1967 struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1968 unsigned long addr, int order, pgoff_t *ilx) 1969 { 1970 struct mempolicy *pol; 1971 1972 pol = __get_vma_policy(vma, addr, ilx); 1973 if (!pol) 1974 pol = get_task_policy(current); 1975 if (pol->mode == MPOL_INTERLEAVE || 1976 pol->mode == MPOL_WEIGHTED_INTERLEAVE) { 1977 *ilx += vma->vm_pgoff >> order; 1978 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); 1979 } 1980 return pol; 1981 } 1982 1983 bool vma_policy_mof(struct vm_area_struct *vma) 1984 { 1985 struct mempolicy *pol; 1986 1987 if (vma->vm_ops && vma->vm_ops->get_policy) { 1988 bool ret = false; 1989 pgoff_t ilx; /* ignored here */ 1990 1991 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); 1992 if (pol && (pol->flags & MPOL_F_MOF)) 1993 ret = true; 1994 mpol_cond_put(pol); 1995 1996 return ret; 1997 } 1998 1999 pol = vma->vm_policy; 2000 if (!pol) 2001 pol = get_task_policy(current); 2002 2003 return pol->flags & MPOL_F_MOF; 2004 } 2005 2006 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 2007 { 2008 enum zone_type dynamic_policy_zone = policy_zone; 2009 2010 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 2011 2012 /* 2013 * if policy->nodes has movable memory only, 2014 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 2015 * 2016 * policy->nodes is intersect with node_states[N_MEMORY]. 2017 * so if the following test fails, it implies 2018 * policy->nodes has movable memory only. 2019 */ 2020 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) 2021 dynamic_policy_zone = ZONE_MOVABLE; 2022 2023 return zone >= dynamic_policy_zone; 2024 } 2025 2026 static unsigned int weighted_interleave_nodes(struct mempolicy *policy) 2027 { 2028 unsigned int node; 2029 unsigned int cpuset_mems_cookie; 2030 2031 retry: 2032 /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ 2033 cpuset_mems_cookie = read_mems_allowed_begin(); 2034 node = current->il_prev; 2035 if (!current->il_weight || !node_isset(node, policy->nodes)) { 2036 node = next_node_in(node, policy->nodes); 2037 if (read_mems_allowed_retry(cpuset_mems_cookie)) 2038 goto retry; 2039 if (node == MAX_NUMNODES) 2040 return node; 2041 current->il_prev = node; 2042 current->il_weight = get_il_weight(node); 2043 } 2044 current->il_weight--; 2045 return node; 2046 } 2047 2048 /* Do dynamic interleaving for a process */ 2049 static unsigned int interleave_nodes(struct mempolicy *policy) 2050 { 2051 unsigned int nid; 2052 unsigned int cpuset_mems_cookie; 2053 2054 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ 2055 do { 2056 cpuset_mems_cookie = read_mems_allowed_begin(); 2057 nid = next_node_in(current->il_prev, policy->nodes); 2058 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2059 2060 if (nid < MAX_NUMNODES) 2061 current->il_prev = nid; 2062 return nid; 2063 } 2064 2065 /* 2066 * Depending on the memory policy provide a node from which to allocate the 2067 * next slab entry. 
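 *
 * Illustrative use (a sketch, not the slab allocator's actual code):
 *
 *	int nid = mempolicy_slab_node();
 *	struct page *page = alloc_pages_node(nid, GFP_KERNEL, 0);
 *
 * Successive calls rotate through the nodemask for the two interleave modes;
 * MPOL_BIND and MPOL_PREFERRED_MANY return the first policy node in the
 * local fallback zonelist, MPOL_PREFERRED returns its single preferred node
 * and MPOL_LOCAL returns the local node.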
2068 */ 2069 unsigned int mempolicy_slab_node(void) 2070 { 2071 struct mempolicy *policy; 2072 int node = numa_mem_id(); 2073 2074 if (!in_task()) 2075 return node; 2076 2077 policy = current->mempolicy; 2078 if (!policy) 2079 return node; 2080 2081 switch (policy->mode) { 2082 case MPOL_PREFERRED: 2083 return first_node(policy->nodes); 2084 2085 case MPOL_INTERLEAVE: 2086 return interleave_nodes(policy); 2087 2088 case MPOL_WEIGHTED_INTERLEAVE: 2089 return weighted_interleave_nodes(policy); 2090 2091 case MPOL_BIND: 2092 case MPOL_PREFERRED_MANY: 2093 { 2094 struct zoneref *z; 2095 2096 /* 2097 * Follow bind policy behavior and start allocation at the 2098 * first node. 2099 */ 2100 struct zonelist *zonelist; 2101 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 2102 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; 2103 z = first_zones_zonelist(zonelist, highest_zoneidx, 2104 &policy->nodes); 2105 return zonelist_zone(z) ? zonelist_node_idx(z) : node; 2106 } 2107 case MPOL_LOCAL: 2108 return node; 2109 2110 default: 2111 BUG(); 2112 } 2113 } 2114 2115 static unsigned int read_once_policy_nodemask(struct mempolicy *pol, 2116 nodemask_t *mask) 2117 { 2118 /* 2119 * barrier stabilizes the nodemask locally so that it can be iterated 2120 * over safely without concern for changes. Allocators validate node 2121 * selection does not violate mems_allowed, so this is safe. 2122 */ 2123 barrier(); 2124 memcpy(mask, &pol->nodes, sizeof(nodemask_t)); 2125 barrier(); 2126 return nodes_weight(*mask); 2127 } 2128 2129 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2130 { 2131 struct weighted_interleave_state *state; 2132 nodemask_t nodemask; 2133 unsigned int target, nr_nodes; 2134 u8 *table = NULL; 2135 unsigned int weight_total = 0; 2136 u8 weight; 2137 int nid = 0; 2138 2139 nr_nodes = read_once_policy_nodemask(pol, &nodemask); 2140 if (!nr_nodes) 2141 return numa_node_id(); 2142 2143 rcu_read_lock(); 2144 2145 state = rcu_dereference(wi_state); 2146 /* Uninitialized wi_state means we should assume all weights are 1 */ 2147 if (state) 2148 table = state->iw_table; 2149 2150 /* calculate the total weight */ 2151 for_each_node_mask(nid, nodemask) 2152 weight_total += table ? table[nid] : 1; 2153 2154 /* Calculate the node offset based on totals */ 2155 target = ilx % weight_total; 2156 nid = first_node(nodemask); 2157 while (target) { 2158 /* detect system default usage */ 2159 weight = table ? table[nid] : 1; 2160 if (target < weight) 2161 break; 2162 target -= weight; 2163 nid = next_node_in(nid, nodemask); 2164 } 2165 rcu_read_unlock(); 2166 return nid; 2167 } 2168 2169 /* 2170 * Do static interleaving for interleave index @ilx. Returns the ilx'th 2171 * node in pol->nodes (starting from ilx=0), wrapping around if ilx 2172 * exceeds the number of present nodes. 2173 */ 2174 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 2175 { 2176 nodemask_t nodemask; 2177 unsigned int target, nnodes; 2178 int i; 2179 int nid; 2180 2181 nnodes = read_once_policy_nodemask(pol, &nodemask); 2182 if (!nnodes) 2183 return numa_node_id(); 2184 target = ilx % nnodes; 2185 nid = first_node(nodemask); 2186 for (i = 0; i < target; i++) 2187 nid = next_node(nid, nodemask); 2188 return nid; 2189 } 2190 2191 /* 2192 * Return a nodemask representing a mempolicy for filtering nodes for 2193 * page allocation, together with preferred node id (or the input node id). 
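 *
 * For example (illustrative): MPOL_PREFERRED over node 2 overrides *nid to 2
 * and returns NULL; MPOL_BIND over nodes 0-1 returns &pol->nodes and only
 * replaces *nid when a home node has been set; the two interleave modes
 * override *nid with the node chosen for @ilx and return NULL.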
2194 */ 2195 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 2196 pgoff_t ilx, int *nid) 2197 { 2198 nodemask_t *nodemask = NULL; 2199 2200 switch (pol->mode) { 2201 case MPOL_PREFERRED: 2202 /* Override input node id */ 2203 *nid = first_node(pol->nodes); 2204 break; 2205 case MPOL_PREFERRED_MANY: 2206 nodemask = &pol->nodes; 2207 if (pol->home_node != NUMA_NO_NODE) 2208 *nid = pol->home_node; 2209 break; 2210 case MPOL_BIND: 2211 /* Restrict to nodemask (but not on lower zones) */ 2212 if (apply_policy_zone(pol, gfp_zone(gfp)) && 2213 cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 2214 nodemask = &pol->nodes; 2215 if (pol->home_node != NUMA_NO_NODE) 2216 *nid = pol->home_node; 2217 /* 2218 * __GFP_THISNODE shouldn't even be used with the bind policy 2219 * because we might easily break the expectation to stay on the 2220 * requested node and not break the policy. 2221 */ 2222 WARN_ON_ONCE(gfp & __GFP_THISNODE); 2223 break; 2224 case MPOL_INTERLEAVE: 2225 /* Override input node id */ 2226 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2227 interleave_nodes(pol) : interleave_nid(pol, ilx); 2228 break; 2229 case MPOL_WEIGHTED_INTERLEAVE: 2230 *nid = (ilx == NO_INTERLEAVE_INDEX) ? 2231 weighted_interleave_nodes(pol) : 2232 weighted_interleave_nid(pol, ilx); 2233 break; 2234 } 2235 2236 return nodemask; 2237 } 2238 2239 #ifdef CONFIG_HUGETLBFS 2240 /* 2241 * huge_node(@vma, @addr, @gfp_flags, @mpol) 2242 * @vma: virtual memory area whose policy is sought 2243 * @addr: address in @vma for shared policy lookup and interleave policy 2244 * @gfp_flags: for requested zone 2245 * @mpol: pointer to mempolicy pointer for reference counted mempolicy 2246 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 2247 * 2248 * Returns a nid suitable for a huge page allocation and a pointer 2249 * to the struct mempolicy for conditional unref after allocation. 2250 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 2251 * to the mempolicy's @nodemask for filtering the zonelist. 2252 */ 2253 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 2254 struct mempolicy **mpol, nodemask_t **nodemask) 2255 { 2256 pgoff_t ilx; 2257 int nid; 2258 2259 nid = numa_node_id(); 2260 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 2261 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 2262 return nid; 2263 } 2264 2265 /* 2266 * init_nodemask_of_mempolicy 2267 * 2268 * If the current task's mempolicy is "default" [NULL], return 'false' 2269 * to indicate default policy. Otherwise, extract the policy nodemask 2270 * for 'bind' or 'interleave' policy into the argument nodemask, or 2271 * initialize the argument nodemask to contain the single node for 2272 * 'preferred' or 'local' policy and return 'true' to indicate presence 2273 * of non-default mempolicy. 2274 * 2275 * We don't bother with reference counting the mempolicy [mpol_get/put] 2276 * because the current task is examining it's own mempolicy and a task's 2277 * mempolicy is only ever changed by the task itself. 2278 * 2279 * N.B., it is the caller's responsibility to free a returned nodemask. 
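 *
 * Illustrative caller pattern (a sketch; the hugetlb code that uses this
 * differs in detail):
 *
 *	nodemask_t mask;
 *
 *	if (init_nodemask_of_mempolicy(&mask))
 *		... spread the huge page allocations over 'mask' ...
 *	else
 *		... no task policy: fall back to all N_MEMORY nodes ...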
2280 */ 2281 bool init_nodemask_of_mempolicy(nodemask_t *mask) 2282 { 2283 struct mempolicy *mempolicy; 2284 2285 if (!(mask && current->mempolicy)) 2286 return false; 2287 2288 task_lock(current); 2289 mempolicy = current->mempolicy; 2290 switch (mempolicy->mode) { 2291 case MPOL_PREFERRED: 2292 case MPOL_PREFERRED_MANY: 2293 case MPOL_BIND: 2294 case MPOL_INTERLEAVE: 2295 case MPOL_WEIGHTED_INTERLEAVE: 2296 *mask = mempolicy->nodes; 2297 break; 2298 2299 case MPOL_LOCAL: 2300 init_nodemask_of_node(mask, numa_node_id()); 2301 break; 2302 2303 default: 2304 BUG(); 2305 } 2306 task_unlock(current); 2307 2308 return true; 2309 } 2310 #endif 2311 2312 /* 2313 * mempolicy_in_oom_domain 2314 * 2315 * If tsk's mempolicy is "bind", check for intersection between mask and 2316 * the policy nodemask. Otherwise, return true for all other policies 2317 * including "interleave", as a tsk with "interleave" policy may have 2318 * memory allocated from all nodes in system. 2319 * 2320 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 2321 */ 2322 bool mempolicy_in_oom_domain(struct task_struct *tsk, 2323 const nodemask_t *mask) 2324 { 2325 struct mempolicy *mempolicy; 2326 bool ret = true; 2327 2328 if (!mask) 2329 return ret; 2330 2331 task_lock(tsk); 2332 mempolicy = tsk->mempolicy; 2333 if (mempolicy && mempolicy->mode == MPOL_BIND) 2334 ret = nodes_intersects(mempolicy->nodes, *mask); 2335 task_unlock(tsk); 2336 2337 return ret; 2338 } 2339 2340 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2341 int nid, nodemask_t *nodemask) 2342 { 2343 struct page *page; 2344 gfp_t preferred_gfp; 2345 2346 /* 2347 * This is a two pass approach. The first pass will only try the 2348 * preferred nodes but skip the direct reclaim and allow the 2349 * allocation to fail, while the second pass will try all the 2350 * nodes in system. 2351 */ 2352 preferred_gfp = gfp | __GFP_NOWARN; 2353 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2354 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); 2355 if (!page) 2356 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); 2357 2358 return page; 2359 } 2360 2361 /** 2362 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2363 * @gfp: GFP flags. 2364 * @order: Order of the page allocation. 2365 * @pol: Pointer to the NUMA mempolicy. 2366 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2367 * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2368 * 2369 * Return: The page on success or NULL if allocation fails. 2370 */ 2371 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2372 struct mempolicy *pol, pgoff_t ilx, int nid) 2373 { 2374 nodemask_t *nodemask; 2375 struct page *page; 2376 2377 nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2378 2379 if (pol->mode == MPOL_PREFERRED_MANY) 2380 return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2381 2382 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2383 /* filter "hugepage" allocation, unless from alloc_pages() */ 2384 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2385 /* 2386 * For hugepage allocation and non-interleave policy which 2387 * allows the current node (or other explicitly preferred 2388 * node) we only try to allocate from the current/preferred 2389 * node and don't fall back to other nodes, as the cost of 2390 * remote accesses would likely offset THP benefits. 
2391 * 2392 * If the policy is interleave or does not allow the current 2393 * node in its nodemask, we allocate the standard way. 2394 */ 2395 if (pol->mode != MPOL_INTERLEAVE && 2396 pol->mode != MPOL_WEIGHTED_INTERLEAVE && 2397 (!nodemask || node_isset(nid, *nodemask))) { 2398 /* 2399 * First, try to allocate THP only on local node, but 2400 * don't reclaim unnecessarily, just compact. 2401 */ 2402 page = __alloc_frozen_pages_noprof( 2403 gfp | __GFP_THISNODE | __GFP_NORETRY, order, 2404 nid, NULL); 2405 if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2406 return page; 2407 /* 2408 * If hugepage allocations are configured to always 2409 * synchronous compact or the vma has been madvised 2410 * to prefer hugepage backing, retry allowing remote 2411 * memory with both reclaim and compact as well. 2412 */ 2413 } 2414 } 2415 2416 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); 2417 2418 if (unlikely(pol->mode == MPOL_INTERLEAVE || 2419 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { 2420 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2421 if (static_branch_likely(&vm_numa_stat_key) && 2422 page_to_nid(page) == nid) { 2423 preempt_disable(); 2424 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2425 preempt_enable(); 2426 } 2427 } 2428 2429 return page; 2430 } 2431 2432 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, 2433 struct mempolicy *pol, pgoff_t ilx, int nid) 2434 { 2435 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, 2436 ilx, nid); 2437 if (!page) 2438 return NULL; 2439 2440 set_page_refcounted(page); 2441 return page_rmappable_folio(page); 2442 } 2443 2444 /** 2445 * vma_alloc_folio - Allocate a folio for a VMA. 2446 * @gfp: GFP flags. 2447 * @order: Order of the folio. 2448 * @vma: Pointer to VMA. 2449 * @addr: Virtual address of the allocation. Must be inside @vma. 2450 * 2451 * Allocate a folio for a specific address in @vma, using the appropriate 2452 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2453 * VMA to prevent it from going away. Should be used for all allocations 2454 * for folios that will be mapped into user space, excepting hugetlbfs, and 2455 * excepting where direct use of folio_alloc_mpol() is more appropriate. 2456 * 2457 * Return: The folio on success or NULL if allocation fails. 2458 */ 2459 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, 2460 unsigned long addr) 2461 { 2462 struct mempolicy *pol; 2463 pgoff_t ilx; 2464 struct folio *folio; 2465 2466 if (vma->vm_flags & VM_DROPPABLE) 2467 gfp |= __GFP_NOWARN; 2468 2469 pol = get_vma_policy(vma, addr, order, &ilx); 2470 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2471 mpol_cond_put(pol); 2472 return folio; 2473 } 2474 EXPORT_SYMBOL(vma_alloc_folio_noprof); 2475 2476 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) 2477 { 2478 struct mempolicy *pol = &default_policy; 2479 2480 /* 2481 * No reference counting needed for current->mempolicy 2482 * nor system default_policy 2483 */ 2484 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2485 pol = get_task_policy(current); 2486 2487 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, 2488 numa_node_id()); 2489 } 2490 2491 /** 2492 * alloc_pages - Allocate pages. 2493 * @gfp: GFP flags. 2494 * @order: Power of two of number of pages to allocate. 2495 * 2496 * Allocate 1 << @order contiguous pages. 
The physical address of the 2497 * first page is naturally aligned (eg an order-3 allocation will be aligned 2498 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current 2499 * process is honoured when in process context. 2500 * 2501 * Context: Can be called from any context, providing the appropriate GFP 2502 * flags are used. 2503 * Return: The page on success or NULL if allocation fails. 2504 */ 2505 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) 2506 { 2507 struct page *page = alloc_frozen_pages_noprof(gfp, order); 2508 2509 if (page) 2510 set_page_refcounted(page); 2511 return page; 2512 } 2513 EXPORT_SYMBOL(alloc_pages_noprof); 2514 2515 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) 2516 { 2517 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); 2518 } 2519 EXPORT_SYMBOL(folio_alloc_noprof); 2520 2521 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, 2522 struct mempolicy *pol, unsigned long nr_pages, 2523 struct page **page_array) 2524 { 2525 int nodes; 2526 unsigned long nr_pages_per_node; 2527 int delta; 2528 int i; 2529 unsigned long nr_allocated; 2530 unsigned long total_allocated = 0; 2531 2532 nodes = nodes_weight(pol->nodes); 2533 nr_pages_per_node = nr_pages / nodes; 2534 delta = nr_pages - nodes * nr_pages_per_node; 2535 2536 for (i = 0; i < nodes; i++) { 2537 if (delta) { 2538 nr_allocated = alloc_pages_bulk_noprof(gfp, 2539 interleave_nodes(pol), NULL, 2540 nr_pages_per_node + 1, 2541 page_array); 2542 delta--; 2543 } else { 2544 nr_allocated = alloc_pages_bulk_noprof(gfp, 2545 interleave_nodes(pol), NULL, 2546 nr_pages_per_node, page_array); 2547 } 2548 2549 page_array += nr_allocated; 2550 total_allocated += nr_allocated; 2551 } 2552 2553 return total_allocated; 2554 } 2555 2556 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, 2557 struct mempolicy *pol, unsigned long nr_pages, 2558 struct page **page_array) 2559 { 2560 struct weighted_interleave_state *state; 2561 struct task_struct *me = current; 2562 unsigned int cpuset_mems_cookie; 2563 unsigned long total_allocated = 0; 2564 unsigned long nr_allocated = 0; 2565 unsigned long rounds; 2566 unsigned long node_pages, delta; 2567 u8 *weights, weight; 2568 unsigned int weight_total = 0; 2569 unsigned long rem_pages = nr_pages; 2570 nodemask_t nodes; 2571 int nnodes, node; 2572 int resume_node = MAX_NUMNODES - 1; 2573 u8 resume_weight = 0; 2574 int prev_node; 2575 int i; 2576 2577 if (!nr_pages) 2578 return 0; 2579 2580 /* read the nodes onto the stack, retry if done during rebind */ 2581 do { 2582 cpuset_mems_cookie = read_mems_allowed_begin(); 2583 nnodes = read_once_policy_nodemask(pol, &nodes); 2584 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 2585 2586 /* if the nodemask has become invalid, we cannot do anything */ 2587 if (!nnodes) 2588 return 0; 2589 2590 /* Continue allocating from most recent node and adjust the nr_pages */ 2591 node = me->il_prev; 2592 weight = me->il_weight; 2593 if (weight && node_isset(node, nodes)) { 2594 node_pages = min(rem_pages, weight); 2595 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2596 page_array); 2597 page_array += nr_allocated; 2598 total_allocated += nr_allocated; 2599 /* if that's all the pages, no need to interleave */ 2600 if (rem_pages <= weight) { 2601 me->il_weight -= rem_pages; 2602 return total_allocated; 2603 } 2604 /* Otherwise we adjust remaining pages, continue from there */ 2605 rem_pages -= weight; 2606 } 2607 /* clear active weight in 
case of an allocation failure */ 2608 me->il_weight = 0; 2609 prev_node = node; 2610 2611 /* create a local copy of node weights to operate on outside rcu */ 2612 weights = kzalloc(nr_node_ids, GFP_KERNEL); 2613 if (!weights) 2614 return total_allocated; 2615 2616 rcu_read_lock(); 2617 state = rcu_dereference(wi_state); 2618 if (state) { 2619 memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); 2620 rcu_read_unlock(); 2621 } else { 2622 rcu_read_unlock(); 2623 for (i = 0; i < nr_node_ids; i++) 2624 weights[i] = 1; 2625 } 2626 2627 /* calculate total, detect system default usage */ 2628 for_each_node_mask(node, nodes) 2629 weight_total += weights[node]; 2630 2631 /* 2632 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 2633 * Track which node weighted interleave should resume from. 2634 * 2635 * if (rounds > 0) and (delta == 0), resume_node will always be 2636 * the node following prev_node and its weight. 2637 */ 2638 rounds = rem_pages / weight_total; 2639 delta = rem_pages % weight_total; 2640 resume_node = next_node_in(prev_node, nodes); 2641 resume_weight = weights[resume_node]; 2642 for (i = 0; i < nnodes; i++) { 2643 node = next_node_in(prev_node, nodes); 2644 weight = weights[node]; 2645 node_pages = weight * rounds; 2646 /* If a delta exists, add this node's portion of the delta */ 2647 if (delta > weight) { 2648 node_pages += weight; 2649 delta -= weight; 2650 } else if (delta) { 2651 /* when delta is depleted, resume from that node */ 2652 node_pages += delta; 2653 resume_node = node; 2654 resume_weight = weight - delta; 2655 delta = 0; 2656 } 2657 /* node_pages can be 0 if an allocation fails and rounds == 0 */ 2658 if (!node_pages) 2659 break; 2660 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, 2661 page_array); 2662 page_array += nr_allocated; 2663 total_allocated += nr_allocated; 2664 if (total_allocated == nr_pages) 2665 break; 2666 prev_node = node; 2667 } 2668 me->il_prev = resume_node; 2669 me->il_weight = resume_weight; 2670 kfree(weights); 2671 return total_allocated; 2672 } 2673 2674 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, 2675 struct mempolicy *pol, unsigned long nr_pages, 2676 struct page **page_array) 2677 { 2678 gfp_t preferred_gfp; 2679 unsigned long nr_allocated = 0; 2680 2681 preferred_gfp = gfp | __GFP_NOWARN; 2682 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2683 2684 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, 2685 nr_pages, page_array); 2686 2687 if (nr_allocated < nr_pages) 2688 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, 2689 nr_pages - nr_allocated, 2690 page_array + nr_allocated); 2691 return nr_allocated; 2692 } 2693 2694 /* alloc pages bulk and mempolicy should be considered at the 2695 * same time in some situation such as vmalloc. 2696 * 2697 * It can accelerate memory allocation especially interleaving 2698 * allocate memory. 
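 *
 * For example (illustrative): under MPOL_INTERLEAVE across N nodes, a bulk
 * request for 1024 pages becomes roughly one bulk call of 1024/N pages per
 * node, rather than 1024 single-page allocations that each re-evaluate the
 * policy.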
2699 */ 2700 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, 2701 unsigned long nr_pages, struct page **page_array) 2702 { 2703 struct mempolicy *pol = &default_policy; 2704 nodemask_t *nodemask; 2705 int nid; 2706 2707 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2708 pol = get_task_policy(current); 2709 2710 if (pol->mode == MPOL_INTERLEAVE) 2711 return alloc_pages_bulk_interleave(gfp, pol, 2712 nr_pages, page_array); 2713 2714 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) 2715 return alloc_pages_bulk_weighted_interleave( 2716 gfp, pol, nr_pages, page_array); 2717 2718 if (pol->mode == MPOL_PREFERRED_MANY) 2719 return alloc_pages_bulk_preferred_many(gfp, 2720 numa_node_id(), pol, nr_pages, page_array); 2721 2722 nid = numa_node_id(); 2723 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2724 return alloc_pages_bulk_noprof(gfp, nid, nodemask, 2725 nr_pages, page_array); 2726 } 2727 2728 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2729 { 2730 struct mempolicy *pol = mpol_dup(src->vm_policy); 2731 2732 if (IS_ERR(pol)) 2733 return PTR_ERR(pol); 2734 dst->vm_policy = pol; 2735 return 0; 2736 } 2737 2738 /* 2739 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2740 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2741 * with the mems_allowed returned by cpuset_mems_allowed(). This 2742 * keeps mempolicies cpuset relative after its cpuset moves. See 2743 * further kernel/cpuset.c update_nodemask(). 2744 * 2745 * current's mempolicy may be rebinded by the other task(the task that changes 2746 * cpuset's mems), so we needn't do rebind work for current task. 2747 */ 2748 2749 /* Slow path of a mempolicy duplicate */ 2750 struct mempolicy *__mpol_dup(struct mempolicy *old) 2751 { 2752 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2753 2754 if (!new) 2755 return ERR_PTR(-ENOMEM); 2756 2757 /* task's mempolicy is protected by alloc_lock */ 2758 if (old == current->mempolicy) { 2759 task_lock(current); 2760 *new = *old; 2761 task_unlock(current); 2762 } else 2763 *new = *old; 2764 2765 if (current_cpuset_is_being_rebound()) { 2766 nodemask_t mems = cpuset_mems_allowed(current); 2767 mpol_rebind_policy(new, &mems); 2768 } 2769 atomic_set(&new->refcnt, 1); 2770 return new; 2771 } 2772 2773 /* Slow path of a mempolicy comparison */ 2774 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2775 { 2776 if (!a || !b) 2777 return false; 2778 if (a->mode != b->mode) 2779 return false; 2780 if (a->flags != b->flags) 2781 return false; 2782 if (a->home_node != b->home_node) 2783 return false; 2784 if (mpol_store_user_nodemask(a)) 2785 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2786 return false; 2787 2788 switch (a->mode) { 2789 case MPOL_BIND: 2790 case MPOL_INTERLEAVE: 2791 case MPOL_PREFERRED: 2792 case MPOL_PREFERRED_MANY: 2793 case MPOL_WEIGHTED_INTERLEAVE: 2794 return !!nodes_equal(a->nodes, b->nodes); 2795 case MPOL_LOCAL: 2796 return true; 2797 default: 2798 BUG(); 2799 return false; 2800 } 2801 } 2802 2803 /* 2804 * Shared memory backing store policy support. 2805 * 2806 * Remember policies even when nobody has shared memory mapped. 2807 * The policies are kept in Red-Black tree linked from the inode. 2808 * They are protected by the sp->lock rwlock, which should be held 2809 * for any accesses to the tree. 2810 */ 2811 2812 /* 2813 * lookup first element intersecting start-end. 
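 * Ranges are [start, end) page offsets into the backing object; after the
 * descent finds any overlapping node, the loop below walks back to the
 * left-most node that still overlaps @start.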
Caller holds sp->lock for 2814 * reading or for writing 2815 */ 2816 static struct sp_node *sp_lookup(struct shared_policy *sp, 2817 pgoff_t start, pgoff_t end) 2818 { 2819 struct rb_node *n = sp->root.rb_node; 2820 2821 while (n) { 2822 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2823 2824 if (start >= p->end) 2825 n = n->rb_right; 2826 else if (end <= p->start) 2827 n = n->rb_left; 2828 else 2829 break; 2830 } 2831 if (!n) 2832 return NULL; 2833 for (;;) { 2834 struct sp_node *w = NULL; 2835 struct rb_node *prev = rb_prev(n); 2836 if (!prev) 2837 break; 2838 w = rb_entry(prev, struct sp_node, nd); 2839 if (w->end <= start) 2840 break; 2841 n = prev; 2842 } 2843 return rb_entry(n, struct sp_node, nd); 2844 } 2845 2846 /* 2847 * Insert a new shared policy into the list. Caller holds sp->lock for 2848 * writing. 2849 */ 2850 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2851 { 2852 struct rb_node **p = &sp->root.rb_node; 2853 struct rb_node *parent = NULL; 2854 struct sp_node *nd; 2855 2856 while (*p) { 2857 parent = *p; 2858 nd = rb_entry(parent, struct sp_node, nd); 2859 if (new->start < nd->start) 2860 p = &(*p)->rb_left; 2861 else if (new->end > nd->end) 2862 p = &(*p)->rb_right; 2863 else 2864 BUG(); 2865 } 2866 rb_link_node(&new->nd, parent, p); 2867 rb_insert_color(&new->nd, &sp->root); 2868 } 2869 2870 /* Find shared policy intersecting idx */ 2871 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, 2872 pgoff_t idx) 2873 { 2874 struct mempolicy *pol = NULL; 2875 struct sp_node *sn; 2876 2877 if (!sp->root.rb_node) 2878 return NULL; 2879 read_lock(&sp->lock); 2880 sn = sp_lookup(sp, idx, idx+1); 2881 if (sn) { 2882 mpol_get(sn->policy); 2883 pol = sn->policy; 2884 } 2885 read_unlock(&sp->lock); 2886 return pol; 2887 } 2888 2889 static void sp_free(struct sp_node *n) 2890 { 2891 mpol_put(n->policy); 2892 kmem_cache_free(sn_cache, n); 2893 } 2894 2895 /** 2896 * mpol_misplaced - check whether current folio node is valid in policy 2897 * 2898 * @folio: folio to be checked 2899 * @vmf: structure describing the fault 2900 * @addr: virtual address in @vma for shared policy lookup and interleave policy 2901 * 2902 * Lookup current policy node id for vma,addr and "compare to" folio's 2903 * node id. Policy determination "mimics" alloc_page_vma(). 2904 * Called from fault path where we know the vma and faulting address. 2905 * 2906 * Return: NUMA_NO_NODE if the page is in a node that is valid for this 2907 * policy, or a suitable node ID to allocate a replacement folio from. 
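 *
 * Sketch of the expected use from a NUMA hinting fault handler (illustrative;
 * real callers add rate limiting and migration bookkeeping):
 *
 *	int nid = mpol_misplaced(folio, vmf, addr);
 *
 *	if (nid != NUMA_NO_NODE)
 *		... try to migrate @folio to @nid ...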
2908 */ 2909 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, 2910 unsigned long addr) 2911 { 2912 struct mempolicy *pol; 2913 pgoff_t ilx; 2914 struct zoneref *z; 2915 int curnid = folio_nid(folio); 2916 struct vm_area_struct *vma = vmf->vma; 2917 int thiscpu = raw_smp_processor_id(); 2918 int thisnid = numa_node_id(); 2919 int polnid = NUMA_NO_NODE; 2920 int ret = NUMA_NO_NODE; 2921 2922 /* 2923 * Make sure ptl is held so that we don't preempt and we 2924 * have a stable smp processor id 2925 */ 2926 lockdep_assert_held(vmf->ptl); 2927 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2928 if (!(pol->flags & MPOL_F_MOF)) 2929 goto out; 2930 2931 switch (pol->mode) { 2932 case MPOL_INTERLEAVE: 2933 polnid = interleave_nid(pol, ilx); 2934 break; 2935 2936 case MPOL_WEIGHTED_INTERLEAVE: 2937 polnid = weighted_interleave_nid(pol, ilx); 2938 break; 2939 2940 case MPOL_PREFERRED: 2941 if (node_isset(curnid, pol->nodes)) 2942 goto out; 2943 polnid = first_node(pol->nodes); 2944 break; 2945 2946 case MPOL_LOCAL: 2947 polnid = numa_node_id(); 2948 break; 2949 2950 case MPOL_BIND: 2951 case MPOL_PREFERRED_MANY: 2952 /* 2953 * Even though MPOL_PREFERRED_MANY can allocate pages outside 2954 * policy nodemask we don't allow numa migration to nodes 2955 * outside policy nodemask for now. This is done so that if we 2956 * want demotion to slow memory to happen, before allocating 2957 * from some DRAM node say 'x', we will end up using a 2958 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario 2959 * we should not promote to node 'x' from slow memory node. 2960 */ 2961 if (pol->flags & MPOL_F_MORON) { 2962 /* 2963 * Optimize placement among multiple nodes 2964 * via NUMA balancing 2965 */ 2966 if (node_isset(thisnid, pol->nodes)) 2967 break; 2968 goto out; 2969 } 2970 2971 /* 2972 * use current page if in policy nodemask, 2973 * else select nearest allowed node, if any. 2974 * If no allowed nodes, use current [!misplaced]. 2975 */ 2976 if (node_isset(curnid, pol->nodes)) 2977 goto out; 2978 z = first_zones_zonelist( 2979 node_zonelist(thisnid, GFP_HIGHUSER), 2980 gfp_zone(GFP_HIGHUSER), 2981 &pol->nodes); 2982 polnid = zonelist_node_idx(z); 2983 break; 2984 2985 default: 2986 BUG(); 2987 } 2988 2989 /* Migrate the folio towards the node whose CPU is referencing it */ 2990 if (pol->flags & MPOL_F_MORON) { 2991 polnid = thisnid; 2992 2993 if (!should_numa_migrate_memory(current, folio, curnid, 2994 thiscpu)) 2995 goto out; 2996 } 2997 2998 if (curnid != polnid) 2999 ret = polnid; 3000 out: 3001 mpol_cond_put(pol); 3002 3003 return ret; 3004 } 3005 3006 /* 3007 * Drop the (possibly final) reference to task->mempolicy. It needs to be 3008 * dropped after task->mempolicy is set to NULL so that any allocation done as 3009 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed 3010 * policy. 
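 *
 * That is, the helper below deliberately orders things as
 *
 *	task->mempolicy = NULL;		(under task_lock)
 *	mpol_put(pol);			(after task_unlock)
 *
 * rather than dropping the reference while the pointer is still reachable.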
3011 */ 3012 void mpol_put_task_policy(struct task_struct *task) 3013 { 3014 struct mempolicy *pol; 3015 3016 task_lock(task); 3017 pol = task->mempolicy; 3018 task->mempolicy = NULL; 3019 task_unlock(task); 3020 mpol_put(pol); 3021 } 3022 3023 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 3024 { 3025 rb_erase(&n->nd, &sp->root); 3026 sp_free(n); 3027 } 3028 3029 static void sp_node_init(struct sp_node *node, unsigned long start, 3030 unsigned long end, struct mempolicy *pol) 3031 { 3032 node->start = start; 3033 node->end = end; 3034 node->policy = pol; 3035 } 3036 3037 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 3038 struct mempolicy *pol) 3039 { 3040 struct sp_node *n; 3041 struct mempolicy *newpol; 3042 3043 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3044 if (!n) 3045 return NULL; 3046 3047 newpol = mpol_dup(pol); 3048 if (IS_ERR(newpol)) { 3049 kmem_cache_free(sn_cache, n); 3050 return NULL; 3051 } 3052 newpol->flags |= MPOL_F_SHARED; 3053 sp_node_init(n, start, end, newpol); 3054 3055 return n; 3056 } 3057 3058 /* Replace a policy range. */ 3059 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, 3060 pgoff_t end, struct sp_node *new) 3061 { 3062 struct sp_node *n; 3063 struct sp_node *n_new = NULL; 3064 struct mempolicy *mpol_new = NULL; 3065 int ret = 0; 3066 3067 restart: 3068 write_lock(&sp->lock); 3069 n = sp_lookup(sp, start, end); 3070 /* Take care of old policies in the same range. */ 3071 while (n && n->start < end) { 3072 struct rb_node *next = rb_next(&n->nd); 3073 if (n->start >= start) { 3074 if (n->end <= end) 3075 sp_delete(sp, n); 3076 else 3077 n->start = end; 3078 } else { 3079 /* Old policy spanning whole new range. */ 3080 if (n->end > end) { 3081 if (!n_new) 3082 goto alloc_new; 3083 3084 *mpol_new = *n->policy; 3085 atomic_set(&mpol_new->refcnt, 1); 3086 sp_node_init(n_new, end, n->end, mpol_new); 3087 n->end = start; 3088 sp_insert(sp, n_new); 3089 n_new = NULL; 3090 mpol_new = NULL; 3091 break; 3092 } else 3093 n->end = start; 3094 } 3095 if (!next) 3096 break; 3097 n = rb_entry(next, struct sp_node, nd); 3098 } 3099 if (new) 3100 sp_insert(sp, new); 3101 write_unlock(&sp->lock); 3102 ret = 0; 3103 3104 err_out: 3105 if (mpol_new) 3106 mpol_put(mpol_new); 3107 if (n_new) 3108 kmem_cache_free(sn_cache, n_new); 3109 3110 return ret; 3111 3112 alloc_new: 3113 write_unlock(&sp->lock); 3114 ret = -ENOMEM; 3115 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 3116 if (!n_new) 3117 goto err_out; 3118 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 3119 if (!mpol_new) 3120 goto err_out; 3121 atomic_set(&mpol_new->refcnt, 1); 3122 goto restart; 3123 } 3124 3125 /** 3126 * mpol_shared_policy_init - initialize shared policy for inode 3127 * @sp: pointer to inode shared policy 3128 * @mpol: struct mempolicy to install 3129 * 3130 * Install non-NULL @mpol in inode's shared policy rb-tree. 3131 * On entry, the current task has a reference on a non-NULL @mpol. 3132 * This must be released on exit. 3133 * This is called at get_inode() calls and we can use GFP_KERNEL. 
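 *
 * Illustrative caller sketch (tmpfs-style inode creation; 'info' and 'mpol'
 * here are placeholders, not a specific call site):
 *
 *	mpol_shared_policy_init(&info->policy, mpol);
 *
 * where @mpol is a mount-wide mempolicy reference that this function
 * consumes, or NULL to leave the inode on the default policy.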
3134 */ 3135 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 3136 { 3137 int ret; 3138 3139 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 3140 rwlock_init(&sp->lock); 3141 3142 if (mpol) { 3143 struct sp_node *sn; 3144 struct mempolicy *npol; 3145 NODEMASK_SCRATCH(scratch); 3146 3147 if (!scratch) 3148 goto put_mpol; 3149 3150 /* contextualize the tmpfs mount point mempolicy to this file */ 3151 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 3152 if (IS_ERR(npol)) 3153 goto free_scratch; /* no valid nodemask intersection */ 3154 3155 task_lock(current); 3156 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); 3157 task_unlock(current); 3158 if (ret) 3159 goto put_npol; 3160 3161 /* alloc node covering entire file; adds ref to file's npol */ 3162 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); 3163 if (sn) 3164 sp_insert(sp, sn); 3165 put_npol: 3166 mpol_put(npol); /* drop initial ref on file's npol */ 3167 free_scratch: 3168 NODEMASK_SCRATCH_FREE(scratch); 3169 put_mpol: 3170 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 3171 } 3172 } 3173 3174 int mpol_set_shared_policy(struct shared_policy *sp, 3175 struct vm_area_struct *vma, struct mempolicy *pol) 3176 { 3177 int err; 3178 struct sp_node *new = NULL; 3179 unsigned long sz = vma_pages(vma); 3180 3181 if (pol) { 3182 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); 3183 if (!new) 3184 return -ENOMEM; 3185 } 3186 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); 3187 if (err && new) 3188 sp_free(new); 3189 return err; 3190 } 3191 3192 /* Free a backing policy store on inode delete. */ 3193 void mpol_free_shared_policy(struct shared_policy *sp) 3194 { 3195 struct sp_node *n; 3196 struct rb_node *next; 3197 3198 if (!sp->root.rb_node) 3199 return; 3200 write_lock(&sp->lock); 3201 next = rb_first(&sp->root); 3202 while (next) { 3203 n = rb_entry(next, struct sp_node, nd); 3204 next = rb_next(&n->nd); 3205 sp_delete(sp, n); 3206 } 3207 write_unlock(&sp->lock); 3208 } 3209 3210 #ifdef CONFIG_NUMA_BALANCING 3211 static int __initdata numabalancing_override; 3212 3213 static void __init check_numabalancing_enable(void) 3214 { 3215 bool numabalancing_default = false; 3216 3217 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 3218 numabalancing_default = true; 3219 3220 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 3221 if (numabalancing_override) 3222 set_numabalancing_state(numabalancing_override == 1); 3223 3224 if (num_online_nodes() > 1 && !numabalancing_override) { 3225 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", 3226 numabalancing_default ? 
"Enabling" : "Disabling"); 3227 set_numabalancing_state(numabalancing_default); 3228 } 3229 } 3230 3231 static int __init setup_numabalancing(char *str) 3232 { 3233 int ret = 0; 3234 if (!str) 3235 goto out; 3236 3237 if (!strcmp(str, "enable")) { 3238 numabalancing_override = 1; 3239 ret = 1; 3240 } else if (!strcmp(str, "disable")) { 3241 numabalancing_override = -1; 3242 ret = 1; 3243 } 3244 out: 3245 if (!ret) 3246 pr_warn("Unable to parse numa_balancing=\n"); 3247 3248 return ret; 3249 } 3250 __setup("numa_balancing=", setup_numabalancing); 3251 #else 3252 static inline void __init check_numabalancing_enable(void) 3253 { 3254 } 3255 #endif /* CONFIG_NUMA_BALANCING */ 3256 3257 void __init numa_policy_init(void) 3258 { 3259 nodemask_t interleave_nodes; 3260 unsigned long largest = 0; 3261 int nid, prefer = 0; 3262 3263 policy_cache = kmem_cache_create("numa_policy", 3264 sizeof(struct mempolicy), 3265 0, SLAB_PANIC, NULL); 3266 3267 sn_cache = kmem_cache_create("shared_policy_node", 3268 sizeof(struct sp_node), 3269 0, SLAB_PANIC, NULL); 3270 3271 for_each_node(nid) { 3272 preferred_node_policy[nid] = (struct mempolicy) { 3273 .refcnt = ATOMIC_INIT(1), 3274 .mode = MPOL_PREFERRED, 3275 .flags = MPOL_F_MOF | MPOL_F_MORON, 3276 .nodes = nodemask_of_node(nid), 3277 }; 3278 } 3279 3280 /* 3281 * Set interleaving policy for system init. Interleaving is only 3282 * enabled across suitably sized nodes (default is >= 16MB), or 3283 * fall back to the largest node if they're all smaller. 3284 */ 3285 nodes_clear(interleave_nodes); 3286 for_each_node_state(nid, N_MEMORY) { 3287 unsigned long total_pages = node_present_pages(nid); 3288 3289 /* Preserve the largest node */ 3290 if (largest < total_pages) { 3291 largest = total_pages; 3292 prefer = nid; 3293 } 3294 3295 /* Interleave this node? */ 3296 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 3297 node_set(nid, interleave_nodes); 3298 } 3299 3300 /* All too small, use the largest */ 3301 if (unlikely(nodes_empty(interleave_nodes))) 3302 node_set(prefer, interleave_nodes); 3303 3304 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 3305 pr_err("%s: interleaving failed\n", __func__); 3306 3307 check_numabalancing_enable(); 3308 } 3309 3310 /* Reset policy of current process to default */ 3311 void numa_default_policy(void) 3312 { 3313 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 3314 } 3315 3316 /* 3317 * Parse and format mempolicy from/to strings 3318 */ 3319 static const char * const policy_modes[] = 3320 { 3321 [MPOL_DEFAULT] = "default", 3322 [MPOL_PREFERRED] = "prefer", 3323 [MPOL_BIND] = "bind", 3324 [MPOL_INTERLEAVE] = "interleave", 3325 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", 3326 [MPOL_LOCAL] = "local", 3327 [MPOL_PREFERRED_MANY] = "prefer (many)", 3328 }; 3329 3330 #ifdef CONFIG_TMPFS 3331 /** 3332 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 3333 * @str: string containing mempolicy to parse 3334 * @mpol: pointer to struct mempolicy pointer, returned on success. 
3335 * 3336 * Format of input: 3337 * <mode>[=<flags>][:<nodelist>] 3338 * 3339 * Return: %0 on success, else %1 3340 */ 3341 int mpol_parse_str(char *str, struct mempolicy **mpol) 3342 { 3343 struct mempolicy *new = NULL; 3344 unsigned short mode_flags; 3345 nodemask_t nodes; 3346 char *nodelist = strchr(str, ':'); 3347 char *flags = strchr(str, '='); 3348 int err = 1, mode; 3349 3350 if (flags) 3351 *flags++ = '\0'; /* terminate mode string */ 3352 3353 if (nodelist) { 3354 /* NUL-terminate mode or flags string */ 3355 *nodelist++ = '\0'; 3356 if (nodelist_parse(nodelist, nodes)) 3357 goto out; 3358 if (!nodes_subset(nodes, node_states[N_MEMORY])) 3359 goto out; 3360 } else 3361 nodes_clear(nodes); 3362 3363 mode = match_string(policy_modes, MPOL_MAX, str); 3364 if (mode < 0) 3365 goto out; 3366 3367 switch (mode) { 3368 case MPOL_PREFERRED: 3369 /* 3370 * Insist on a nodelist of one node only, although later 3371 * we use first_node(nodes) to grab a single node, so here 3372 * nodelist (or nodes) cannot be empty. 3373 */ 3374 if (nodelist) { 3375 char *rest = nodelist; 3376 while (isdigit(*rest)) 3377 rest++; 3378 if (*rest) 3379 goto out; 3380 if (nodes_empty(nodes)) 3381 goto out; 3382 } 3383 break; 3384 case MPOL_INTERLEAVE: 3385 case MPOL_WEIGHTED_INTERLEAVE: 3386 /* 3387 * Default to online nodes with memory if no nodelist 3388 */ 3389 if (!nodelist) 3390 nodes = node_states[N_MEMORY]; 3391 break; 3392 case MPOL_LOCAL: 3393 /* 3394 * Don't allow a nodelist; mpol_new() checks flags 3395 */ 3396 if (nodelist) 3397 goto out; 3398 break; 3399 case MPOL_DEFAULT: 3400 /* 3401 * Insist on a empty nodelist 3402 */ 3403 if (!nodelist) 3404 err = 0; 3405 goto out; 3406 case MPOL_PREFERRED_MANY: 3407 case MPOL_BIND: 3408 /* 3409 * Insist on a nodelist 3410 */ 3411 if (!nodelist) 3412 goto out; 3413 } 3414 3415 mode_flags = 0; 3416 if (flags) { 3417 /* 3418 * Currently, we only support two mutually exclusive 3419 * mode flags. 3420 */ 3421 if (!strcmp(flags, "static")) 3422 mode_flags |= MPOL_F_STATIC_NODES; 3423 else if (!strcmp(flags, "relative")) 3424 mode_flags |= MPOL_F_RELATIVE_NODES; 3425 else 3426 goto out; 3427 } 3428 3429 new = mpol_new(mode, mode_flags, &nodes); 3430 if (IS_ERR(new)) 3431 goto out; 3432 3433 /* 3434 * Save nodes for mpol_to_str() to show the tmpfs mount options 3435 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 3436 */ 3437 if (mode != MPOL_PREFERRED) { 3438 new->nodes = nodes; 3439 } else if (nodelist) { 3440 nodes_clear(new->nodes); 3441 node_set(first_node(nodes), new->nodes); 3442 } else { 3443 new->mode = MPOL_LOCAL; 3444 } 3445 3446 /* 3447 * Save nodes for contextualization: this will be used to "clone" 3448 * the mempolicy in a specific context [cpuset] at a later time. 3449 */ 3450 new->w.user_nodemask = nodes; 3451 3452 err = 0; 3453 3454 out: 3455 /* Restore string for error message */ 3456 if (nodelist) 3457 *--nodelist = ':'; 3458 if (flags) 3459 *--flags = '='; 3460 if (!err) 3461 *mpol = new; 3462 return err; 3463 } 3464 #endif /* CONFIG_TMPFS */ 3465 3466 /** 3467 * mpol_to_str - format a mempolicy structure for printing 3468 * @buffer: to contain formatted mempolicy string 3469 * @maxlen: length of @buffer 3470 * @pol: pointer to mempolicy to be formatted 3471 * 3472 * Convert @pol into a string. If @buffer is too short, truncate the string. 3473 * Recommend a @maxlen of at least 51 for the longest mode, "weighted 3474 * interleave", plus the longest flag flags, "relative|balancing", and to 3475 * display at least a few node ids. 
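 *
 * Example outputs (illustrative): "default", "prefer:1", "bind=static:0,2",
 * "weighted interleave:0-3".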
3476 */ 3477 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 3478 { 3479 char *p = buffer; 3480 nodemask_t nodes = NODE_MASK_NONE; 3481 unsigned short mode = MPOL_DEFAULT; 3482 unsigned short flags = 0; 3483 3484 if (pol && 3485 pol != &default_policy && 3486 !(pol >= &preferred_node_policy[0] && 3487 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { 3488 mode = pol->mode; 3489 flags = pol->flags; 3490 } 3491 3492 switch (mode) { 3493 case MPOL_DEFAULT: 3494 case MPOL_LOCAL: 3495 break; 3496 case MPOL_PREFERRED: 3497 case MPOL_PREFERRED_MANY: 3498 case MPOL_BIND: 3499 case MPOL_INTERLEAVE: 3500 case MPOL_WEIGHTED_INTERLEAVE: 3501 nodes = pol->nodes; 3502 break; 3503 default: 3504 WARN_ON_ONCE(1); 3505 snprintf(p, maxlen, "unknown"); 3506 return; 3507 } 3508 3509 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 3510 3511 if (flags & MPOL_MODE_FLAGS) { 3512 p += snprintf(p, buffer + maxlen - p, "="); 3513 3514 /* 3515 * Static and relative are mutually exclusive. 3516 */ 3517 if (flags & MPOL_F_STATIC_NODES) 3518 p += snprintf(p, buffer + maxlen - p, "static"); 3519 else if (flags & MPOL_F_RELATIVE_NODES) 3520 p += snprintf(p, buffer + maxlen - p, "relative"); 3521 3522 if (flags & MPOL_F_NUMA_BALANCING) { 3523 if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) 3524 p += snprintf(p, buffer + maxlen - p, "|"); 3525 p += snprintf(p, buffer + maxlen - p, "balancing"); 3526 } 3527 } 3528 3529 if (!nodes_empty(nodes)) 3530 p += scnprintf(p, buffer + maxlen - p, ":%*pbl", 3531 nodemask_pr_args(&nodes)); 3532 } 3533 3534 #ifdef CONFIG_SYSFS 3535 struct iw_node_attr { 3536 struct kobj_attribute kobj_attr; 3537 int nid; 3538 }; 3539 3540 struct sysfs_wi_group { 3541 struct kobject wi_kobj; 3542 struct mutex kobj_lock; 3543 struct iw_node_attr *nattrs[]; 3544 }; 3545 3546 static struct sysfs_wi_group *wi_group; 3547 3548 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, 3549 char *buf) 3550 { 3551 struct iw_node_attr *node_attr; 3552 u8 weight; 3553 3554 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3555 weight = get_il_weight(node_attr->nid); 3556 return sysfs_emit(buf, "%d\n", weight); 3557 } 3558 3559 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, 3560 const char *buf, size_t count) 3561 { 3562 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3563 struct iw_node_attr *node_attr; 3564 u8 weight = 0; 3565 int i; 3566 3567 node_attr = container_of(attr, struct iw_node_attr, kobj_attr); 3568 if (count == 0 || sysfs_streq(buf, "") || 3569 kstrtou8(buf, 0, &weight) || weight == 0) 3570 return -EINVAL; 3571 3572 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 3573 GFP_KERNEL); 3574 if (!new_wi_state) 3575 return -ENOMEM; 3576 3577 mutex_lock(&wi_state_lock); 3578 old_wi_state = rcu_dereference_protected(wi_state, 3579 lockdep_is_held(&wi_state_lock)); 3580 if (old_wi_state) { 3581 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3582 nr_node_ids * sizeof(u8)); 3583 } else { 3584 for (i = 0; i < nr_node_ids; i++) 3585 new_wi_state->iw_table[i] = 1; 3586 } 3587 new_wi_state->iw_table[node_attr->nid] = weight; 3588 new_wi_state->mode_auto = false; 3589 3590 rcu_assign_pointer(wi_state, new_wi_state); 3591 mutex_unlock(&wi_state_lock); 3592 if (old_wi_state) { 3593 synchronize_rcu(); 3594 kfree(old_wi_state); 3595 } 3596 return count; 3597 } 3598 3599 static ssize_t weighted_interleave_auto_show(struct kobject *kobj, 3600 struct kobj_attribute 
*attr, char *buf) 3601 { 3602 struct weighted_interleave_state *state; 3603 bool wi_auto = true; 3604 3605 rcu_read_lock(); 3606 state = rcu_dereference(wi_state); 3607 if (state) 3608 wi_auto = state->mode_auto; 3609 rcu_read_unlock(); 3610 3611 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); 3612 } 3613 3614 static ssize_t weighted_interleave_auto_store(struct kobject *kobj, 3615 struct kobj_attribute *attr, const char *buf, size_t count) 3616 { 3617 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; 3618 unsigned int *bw; 3619 bool input; 3620 int i; 3621 3622 if (kstrtobool(buf, &input)) 3623 return -EINVAL; 3624 3625 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), 3626 GFP_KERNEL); 3627 if (!new_wi_state) 3628 return -ENOMEM; 3629 for (i = 0; i < nr_node_ids; i++) 3630 new_wi_state->iw_table[i] = 1; 3631 3632 mutex_lock(&wi_state_lock); 3633 if (!input) { 3634 old_wi_state = rcu_dereference_protected(wi_state, 3635 lockdep_is_held(&wi_state_lock)); 3636 if (!old_wi_state) 3637 goto update_wi_state; 3638 if (input == old_wi_state->mode_auto) { 3639 mutex_unlock(&wi_state_lock); 3640 return count; 3641 } 3642 3643 memcpy(new_wi_state->iw_table, old_wi_state->iw_table, 3644 nr_node_ids * sizeof(u8)); 3645 goto update_wi_state; 3646 } 3647 3648 bw = node_bw_table; 3649 if (!bw) { 3650 mutex_unlock(&wi_state_lock); 3651 kfree(new_wi_state); 3652 return -ENODEV; 3653 } 3654 3655 new_wi_state->mode_auto = true; 3656 reduce_interleave_weights(bw, new_wi_state->iw_table); 3657 3658 update_wi_state: 3659 rcu_assign_pointer(wi_state, new_wi_state); 3660 mutex_unlock(&wi_state_lock); 3661 if (old_wi_state) { 3662 synchronize_rcu(); 3663 kfree(old_wi_state); 3664 } 3665 return count; 3666 } 3667 3668 static void sysfs_wi_node_delete(int nid) 3669 { 3670 struct iw_node_attr *attr; 3671 3672 if (nid < 0 || nid >= nr_node_ids) 3673 return; 3674 3675 mutex_lock(&wi_group->kobj_lock); 3676 attr = wi_group->nattrs[nid]; 3677 if (!attr) { 3678 mutex_unlock(&wi_group->kobj_lock); 3679 return; 3680 } 3681 3682 wi_group->nattrs[nid] = NULL; 3683 mutex_unlock(&wi_group->kobj_lock); 3684 3685 sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); 3686 kfree(attr->kobj_attr.attr.name); 3687 kfree(attr); 3688 } 3689 3690 static void sysfs_wi_node_delete_all(void) 3691 { 3692 int nid; 3693 3694 for (nid = 0; nid < nr_node_ids; nid++) 3695 sysfs_wi_node_delete(nid); 3696 } 3697 3698 static void wi_state_free(void) 3699 { 3700 struct weighted_interleave_state *old_wi_state; 3701 3702 mutex_lock(&wi_state_lock); 3703 old_wi_state = rcu_dereference_protected(wi_state, 3704 lockdep_is_held(&wi_state_lock)); 3705 rcu_assign_pointer(wi_state, NULL); 3706 mutex_unlock(&wi_state_lock); 3707 3708 if (old_wi_state) { 3709 synchronize_rcu(); 3710 kfree(old_wi_state); 3711 } 3712 } 3713 3714 static struct kobj_attribute wi_auto_attr = 3715 __ATTR(auto, 0664, weighted_interleave_auto_show, 3716 weighted_interleave_auto_store); 3717 3718 static void wi_cleanup(void) { 3719 sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3720 sysfs_wi_node_delete_all(); 3721 wi_state_free(); 3722 } 3723 3724 static void wi_kobj_release(struct kobject *wi_kobj) 3725 { 3726 kfree(wi_group); 3727 } 3728 3729 static const struct kobj_type wi_ktype = { 3730 .sysfs_ops = &kobj_sysfs_ops, 3731 .release = wi_kobj_release, 3732 }; 3733 3734 static int sysfs_wi_node_add(int nid) 3735 { 3736 int ret; 3737 char *name; 3738 struct iw_node_attr *new_attr; 3739 3740 if (nid < 0 || nid >= 
nr_node_ids) { 3741 pr_err("invalid node id: %d\n", nid); 3742 return -EINVAL; 3743 } 3744 3745 new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); 3746 if (!new_attr) 3747 return -ENOMEM; 3748 3749 name = kasprintf(GFP_KERNEL, "node%d", nid); 3750 if (!name) { 3751 kfree(new_attr); 3752 return -ENOMEM; 3753 } 3754 3755 sysfs_attr_init(&new_attr->kobj_attr.attr); 3756 new_attr->kobj_attr.attr.name = name; 3757 new_attr->kobj_attr.attr.mode = 0644; 3758 new_attr->kobj_attr.show = node_show; 3759 new_attr->kobj_attr.store = node_store; 3760 new_attr->nid = nid; 3761 3762 mutex_lock(&wi_group->kobj_lock); 3763 if (wi_group->nattrs[nid]) { 3764 mutex_unlock(&wi_group->kobj_lock); 3765 ret = -EEXIST; 3766 goto out; 3767 } 3768 3769 ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); 3770 if (ret) { 3771 mutex_unlock(&wi_group->kobj_lock); 3772 goto out; 3773 } 3774 wi_group->nattrs[nid] = new_attr; 3775 mutex_unlock(&wi_group->kobj_lock); 3776 return 0; 3777 3778 out: 3779 kfree(new_attr->kobj_attr.attr.name); 3780 kfree(new_attr); 3781 return ret; 3782 } 3783 3784 static int wi_node_notifier(struct notifier_block *nb, 3785 unsigned long action, void *data) 3786 { 3787 int err; 3788 struct node_notify *nn = data; 3789 int nid = nn->nid; 3790 3791 switch (action) { 3792 case NODE_ADDED_FIRST_MEMORY: 3793 err = sysfs_wi_node_add(nid); 3794 if (err) 3795 pr_err("failed to add sysfs for node%d during hotplug: %d\n", 3796 nid, err); 3797 break; 3798 case NODE_REMOVED_LAST_MEMORY: 3799 sysfs_wi_node_delete(nid); 3800 break; 3801 } 3802 3803 return NOTIFY_OK; 3804 } 3805 3806 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) 3807 { 3808 int nid, err; 3809 3810 wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), 3811 GFP_KERNEL); 3812 if (!wi_group) 3813 return -ENOMEM; 3814 mutex_init(&wi_group->kobj_lock); 3815 3816 err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, 3817 "weighted_interleave"); 3818 if (err) 3819 goto err_put_kobj; 3820 3821 err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); 3822 if (err) 3823 goto err_put_kobj; 3824 3825 for_each_online_node(nid) { 3826 if (!node_state(nid, N_MEMORY)) 3827 continue; 3828 3829 err = sysfs_wi_node_add(nid); 3830 if (err) { 3831 pr_err("failed to add sysfs for node%d during init: %d\n", 3832 nid, err); 3833 goto err_cleanup_kobj; 3834 } 3835 } 3836 3837 hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); 3838 return 0; 3839 3840 err_cleanup_kobj: 3841 wi_cleanup(); 3842 kobject_del(&wi_group->wi_kobj); 3843 err_put_kobj: 3844 kobject_put(&wi_group->wi_kobj); 3845 return err; 3846 } 3847 3848 static int __init mempolicy_sysfs_init(void) 3849 { 3850 int err; 3851 static struct kobject *mempolicy_kobj; 3852 3853 mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); 3854 if (!mempolicy_kobj) 3855 return -ENOMEM; 3856 3857 err = add_weighted_interleave_group(mempolicy_kobj); 3858 if (err) 3859 goto err_kobj; 3860 3861 return 0; 3862 3863 err_kobj: 3864 kobject_del(mempolicy_kobj); 3865 kobject_put(mempolicy_kobj); 3866 return err; 3867 } 3868 3869 late_initcall(mempolicy_sysfs_init); 3870 #endif /* CONFIG_SYSFS */ 3871
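
/*
 * Illustrative walk-through of the sysfs interface registered above (paths
 * follow the kobjects created in mempolicy_sysfs_init(); the shell commands
 * are an example, not kernel code):
 *
 *	# cat /sys/kernel/mm/mempolicy/weighted_interleave/auto
 *	true
 *	# echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	# echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 * Writing any per-node weight switches "auto" to false (see node_store());
 * writing "true" back to "auto" rebuilds the weights from the recorded node
 * bandwidths via reduce_interleave_weights(). A task that then selects
 * MPOL_WEIGHTED_INTERLEAVE over nodes 0-1 receives roughly four pages on
 * node 0 for every page allocated on node 1.
 */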