1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. 4 * Authors: David Chinner and Glauber Costa 5 * 6 * Generic LRU infrastructure 7 */ 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/mm.h> 11 #include <linux/list_lru.h> 12 #include <linux/slab.h> 13 #include <linux/mutex.h> 14 #include <linux/memcontrol.h> 15 #include "slab.h" 16 #include "internal.h" 17 18 static inline void lock_list_lru(struct list_lru_one *l, bool irq, 19 unsigned long *irq_flags) 20 { 21 if (irq_flags) 22 spin_lock_irqsave(&l->lock, *irq_flags); 23 else if (irq) 24 spin_lock_irq(&l->lock); 25 else 26 spin_lock(&l->lock); 27 } 28 29 static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off, 30 unsigned long *irq_flags) 31 { 32 if (irq_flags) 33 spin_unlock_irqrestore(&l->lock, *irq_flags); 34 else if (irq_off) 35 spin_unlock_irq(&l->lock); 36 else 37 spin_unlock(&l->lock); 38 } 39 40 #ifdef CONFIG_MEMCG 41 static LIST_HEAD(memcg_list_lrus); 42 static DEFINE_MUTEX(list_lrus_mutex); 43 44 static inline bool list_lru_memcg_aware(struct list_lru *lru) 45 { 46 return lru->memcg_aware; 47 } 48 49 static void list_lru_register(struct list_lru *lru) 50 { 51 if (!list_lru_memcg_aware(lru)) 52 return; 53 54 mutex_lock(&list_lrus_mutex); 55 list_add(&lru->list, &memcg_list_lrus); 56 mutex_unlock(&list_lrus_mutex); 57 } 58 59 static void list_lru_unregister(struct list_lru *lru) 60 { 61 if (!list_lru_memcg_aware(lru)) 62 return; 63 64 mutex_lock(&list_lrus_mutex); 65 list_del(&lru->list); 66 mutex_unlock(&list_lrus_mutex); 67 } 68 69 static int lru_shrinker_id(struct list_lru *lru) 70 { 71 return lru->shrinker_id; 72 } 73 74 static inline struct list_lru_one * 75 list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) 76 { 77 if (list_lru_memcg_aware(lru) && idx >= 0) { 78 struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); 79 80 return mlru ? &mlru->node[nid] : NULL; 81 } 82 return &lru->node[nid].lru; 83 } 84 85 static inline struct list_lru_one * 86 lock_list_lru_of_memcg(struct list_lru *lru, int nid, 87 struct mem_cgroup **memcg, bool irq, 88 unsigned long *irq_flags, bool skip_empty) 89 { 90 struct list_lru_one *l; 91 92 rcu_read_lock(); 93 again: 94 l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg)); 95 if (likely(l)) { 96 lock_list_lru(l, irq, irq_flags); 97 if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { 98 rcu_read_unlock(); 99 return l; 100 } 101 unlock_list_lru(l, irq, irq_flags); 102 } 103 /* 104 * Caller may simply bail out if raced with reparenting or 105 * may iterate through the list_lru and expect empty slots. 106 */ 107 if (skip_empty) { 108 rcu_read_unlock(); 109 return NULL; 110 } 111 VM_WARN_ON(!css_is_dying(&(*memcg)->css)); 112 *memcg = parent_mem_cgroup(*memcg); 113 goto again; 114 } 115 #else 116 static void list_lru_register(struct list_lru *lru) 117 { 118 } 119 120 static void list_lru_unregister(struct list_lru *lru) 121 { 122 } 123 124 static int lru_shrinker_id(struct list_lru *lru) 125 { 126 return -1; 127 } 128 129 static inline bool list_lru_memcg_aware(struct list_lru *lru) 130 { 131 return false; 132 } 133 134 static inline struct list_lru_one * 135 list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) 136 { 137 return &lru->node[nid].lru; 138 } 139 140 static inline struct list_lru_one * 141 lock_list_lru_of_memcg(struct list_lru *lru, int nid, 142 struct mem_cgroup **memcg, bool irq, 143 unsigned long *irq_flags, bool skip_empty) 144 { 145 struct list_lru_one *l = &lru->node[nid].lru; 146 147 lock_list_lru(l, irq, irq_flags); 148 149 return l; 150 } 151 #endif /* CONFIG_MEMCG */ 152 153 struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid, 154 struct mem_cgroup **memcg) 155 { 156 return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/false, 157 /*irq_flags=*/NULL, /*skip_empty=*/false); 158 } 159 160 void list_lru_unlock(struct list_lru_one *l) 161 { 162 unlock_list_lru(l, /*irq_off=*/false, /*irq_flags=*/NULL); 163 } 164 165 struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid, 166 struct mem_cgroup **memcg) 167 { 168 return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, 169 /*irq_flags=*/NULL, /*skip_empty=*/false); 170 } 171 172 void list_lru_unlock_irq(struct list_lru_one *l) 173 { 174 unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/NULL); 175 } 176 177 struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid, 178 struct mem_cgroup **memcg, 179 unsigned long *flags) 180 { 181 return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, 182 /*irq_flags=*/flags, /*skip_empty=*/false); 183 } 184 185 void list_lru_unlock_irqrestore(struct list_lru_one *l, unsigned long *flags) 186 { 187 unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/flags); 188 } 189 190 bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l, 191 struct list_head *item, int nid, 192 struct mem_cgroup *memcg) 193 { 194 if (list_empty(item)) { 195 list_add_tail(item, &l->list); 196 /* 197 * Set shrinker bit on the memcg that owns the locked 198 * sublist - lock_list_lru_of_memcg() may have walked up 199 * past a dying memcg, and the bit must be set there. 200 */ 201 if (!l->nr_items++) 202 set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); 203 atomic_long_inc(&lru->node[nid].nr_items); 204 return true; 205 } 206 return false; 207 } 208 EXPORT_SYMBOL_GPL(list_lru_add); 209 210 bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l, 211 struct list_head *item, int nid) 212 { 213 if (!list_empty(item)) { 214 list_del_init(item); 215 l->nr_items--; 216 atomic_long_dec(&lru->node[nid].nr_items); 217 return true; 218 } 219 return false; 220 } 221 222 /* The caller must ensure the memcg lifetime. */ 223 bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, 224 struct mem_cgroup *memcg) 225 { 226 struct list_lru_one *l; 227 bool ret; 228 229 l = list_lru_lock(lru, nid, &memcg); 230 ret = __list_lru_add(lru, l, item, nid, memcg); 231 list_lru_unlock(l); 232 return ret; 233 } 234 235 bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, 236 int nid, struct mem_cgroup *memcg) 237 { 238 struct list_lru_one *l; 239 bool ret; 240 241 l = list_lru_lock_irq(lru, nid, &memcg); 242 ret = __list_lru_add(lru, l, item, nid, memcg); 243 list_lru_unlock_irq(l); 244 return ret; 245 } 246 247 bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) 248 { 249 bool ret; 250 int nid = page_to_nid(virt_to_page(item)); 251 252 if (list_lru_memcg_aware(lru)) { 253 rcu_read_lock(); 254 ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item)); 255 rcu_read_unlock(); 256 } else { 257 ret = list_lru_add(lru, item, nid, NULL); 258 } 259 260 return ret; 261 } 262 EXPORT_SYMBOL_GPL(list_lru_add_obj); 263 264 /* The caller must ensure the memcg lifetime. */ 265 bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, 266 struct mem_cgroup *memcg) 267 { 268 struct list_lru_one *l; 269 bool ret; 270 271 l = list_lru_lock(lru, nid, &memcg); 272 ret = __list_lru_del(lru, l, item, nid); 273 list_lru_unlock(l); 274 return ret; 275 } 276 277 bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) 278 { 279 bool ret; 280 int nid = page_to_nid(virt_to_page(item)); 281 282 if (list_lru_memcg_aware(lru)) { 283 rcu_read_lock(); 284 ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item)); 285 rcu_read_unlock(); 286 } else { 287 ret = list_lru_del(lru, item, nid, NULL); 288 } 289 290 return ret; 291 } 292 EXPORT_SYMBOL_GPL(list_lru_del_obj); 293 294 void list_lru_isolate(struct list_lru_one *list, struct list_head *item) 295 { 296 list_del_init(item); 297 list->nr_items--; 298 } 299 EXPORT_SYMBOL_GPL(list_lru_isolate); 300 301 void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, 302 struct list_head *head) 303 { 304 list_move(item, head); 305 list->nr_items--; 306 } 307 EXPORT_SYMBOL_GPL(list_lru_isolate_move); 308 309 unsigned long list_lru_count_one(struct list_lru *lru, 310 int nid, struct mem_cgroup *memcg) 311 { 312 struct list_lru_one *l; 313 long count; 314 315 rcu_read_lock(); 316 l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); 317 count = l ? READ_ONCE(l->nr_items) : 0; 318 rcu_read_unlock(); 319 320 if (unlikely(count < 0)) 321 count = 0; 322 323 return count; 324 } 325 EXPORT_SYMBOL_GPL(list_lru_count_one); 326 327 unsigned long list_lru_count_node(struct list_lru *lru, int nid) 328 { 329 struct list_lru_node *nlru; 330 331 nlru = &lru->node[nid]; 332 return atomic_long_read(&nlru->nr_items); 333 } 334 EXPORT_SYMBOL_GPL(list_lru_count_node); 335 336 static unsigned long 337 __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 338 list_lru_walk_cb isolate, void *cb_arg, 339 unsigned long *nr_to_walk, bool irq_off) 340 { 341 struct list_lru_node *nlru = &lru->node[nid]; 342 struct list_lru_one *l = NULL; 343 struct list_head *item, *n; 344 unsigned long isolated = 0; 345 346 restart: 347 l = lock_list_lru_of_memcg(lru, nid, &memcg, /*irq=*/irq_off, 348 /*irq_flags=*/NULL, /*skip_empty=*/true); 349 if (!l) 350 return isolated; 351 list_for_each_safe(item, n, &l->list) { 352 enum lru_status ret; 353 354 /* 355 * decrement nr_to_walk first so that we don't livelock if we 356 * get stuck on large numbers of LRU_RETRY items 357 */ 358 if (!*nr_to_walk) 359 break; 360 --*nr_to_walk; 361 362 ret = isolate(item, l, cb_arg); 363 switch (ret) { 364 /* 365 * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru 366 * lock. List traversal will have to restart from scratch. 367 */ 368 case LRU_RETRY: 369 goto restart; 370 case LRU_REMOVED_RETRY: 371 fallthrough; 372 case LRU_REMOVED: 373 isolated++; 374 atomic_long_dec(&nlru->nr_items); 375 if (ret == LRU_REMOVED_RETRY) 376 goto restart; 377 break; 378 case LRU_ROTATE: 379 list_move_tail(item, &l->list); 380 break; 381 case LRU_SKIP: 382 break; 383 case LRU_STOP: 384 goto out; 385 default: 386 BUG(); 387 } 388 } 389 unlock_list_lru(l, irq_off, NULL); 390 out: 391 return isolated; 392 } 393 394 unsigned long 395 list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 396 list_lru_walk_cb isolate, void *cb_arg, 397 unsigned long *nr_to_walk) 398 { 399 return __list_lru_walk_one(lru, nid, memcg, isolate, 400 cb_arg, nr_to_walk, false); 401 } 402 EXPORT_SYMBOL_GPL(list_lru_walk_one); 403 404 unsigned long 405 list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 406 list_lru_walk_cb isolate, void *cb_arg, 407 unsigned long *nr_to_walk) 408 { 409 return __list_lru_walk_one(lru, nid, memcg, isolate, 410 cb_arg, nr_to_walk, true); 411 } 412 413 unsigned long list_lru_walk_node(struct list_lru *lru, int nid, 414 list_lru_walk_cb isolate, void *cb_arg, 415 unsigned long *nr_to_walk) 416 { 417 long isolated = 0; 418 419 isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, 420 nr_to_walk); 421 422 #ifdef CONFIG_MEMCG 423 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { 424 struct list_lru_memcg *mlru; 425 struct mem_cgroup *memcg; 426 unsigned long index; 427 428 xa_for_each(&lru->xa, index, mlru) { 429 rcu_read_lock(); 430 memcg = mem_cgroup_from_private_id(index); 431 if (!mem_cgroup_tryget(memcg)) { 432 rcu_read_unlock(); 433 continue; 434 } 435 rcu_read_unlock(); 436 isolated += __list_lru_walk_one(lru, nid, memcg, 437 isolate, cb_arg, 438 nr_to_walk, false); 439 mem_cgroup_put(memcg); 440 441 if (*nr_to_walk <= 0) 442 break; 443 } 444 } 445 #endif 446 447 return isolated; 448 } 449 EXPORT_SYMBOL_GPL(list_lru_walk_node); 450 451 static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) 452 { 453 INIT_LIST_HEAD(&l->list); 454 spin_lock_init(&l->lock); 455 l->nr_items = 0; 456 #ifdef CONFIG_LOCKDEP 457 if (lru->key) 458 lockdep_set_class(&l->lock, lru->key); 459 #endif 460 } 461 462 #ifdef CONFIG_MEMCG 463 static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) 464 { 465 int nid; 466 struct list_lru_memcg *mlru; 467 468 mlru = kmalloc_flex(*mlru, node, nr_node_ids, gfp); 469 if (!mlru) 470 return NULL; 471 472 for_each_node(nid) 473 init_one_lru(lru, &mlru->node[nid]); 474 475 return mlru; 476 } 477 478 static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 479 { 480 if (memcg_aware) 481 xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); 482 lru->memcg_aware = memcg_aware; 483 } 484 485 static void memcg_destroy_list_lru(struct list_lru *lru) 486 { 487 XA_STATE(xas, &lru->xa, 0); 488 struct list_lru_memcg *mlru; 489 490 if (!list_lru_memcg_aware(lru)) 491 return; 492 493 xas_lock_irq(&xas); 494 xas_for_each(&xas, mlru, ULONG_MAX) { 495 kfree(mlru); 496 xas_store(&xas, NULL); 497 } 498 xas_unlock_irq(&xas); 499 } 500 501 static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, 502 struct list_lru_one *src, 503 struct mem_cgroup *dst_memcg) 504 { 505 int dst_idx = dst_memcg->kmemcg_id; 506 struct list_lru_one *dst; 507 508 spin_lock_irq(&src->lock); 509 dst = list_lru_from_memcg_idx(lru, nid, dst_idx); 510 spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); 511 512 list_splice_init(&src->list, &dst->list); 513 if (src->nr_items) { 514 WARN_ON(src->nr_items < 0); 515 dst->nr_items += src->nr_items; 516 set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); 517 } 518 /* Mark the list_lru_one dead */ 519 src->nr_items = LONG_MIN; 520 521 spin_unlock(&dst->lock); 522 spin_unlock_irq(&src->lock); 523 } 524 525 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) 526 { 527 struct list_lru *lru; 528 int i; 529 530 mutex_lock(&list_lrus_mutex); 531 list_for_each_entry(lru, &memcg_list_lrus, list) { 532 struct list_lru_memcg *mlru; 533 534 /* 535 * css_is_dying() check in memcg_list_lru_alloc() avoids 536 * allocating a new mlru since CSS_DYING is already set for this 537 * memcg a rcu grace period ago. 538 */ 539 mlru = xa_load(&lru->xa, memcg->kmemcg_id); 540 if (!mlru) 541 continue; 542 543 /* 544 * Reparent each per-node list and mark the child dead 545 * (LONG_MIN) before clearing xarray entry otherwise a 546 * concurrent list_lru_del() may corrupt the list if it arrives 547 * after xarray clear but before reparenting as 548 * lock_list_lru_of_memcg will acquire parent's lock while the 549 * item is still on child's list. 550 */ 551 for_each_node(i) 552 memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); 553 554 xa_erase_irq(&lru->xa, memcg->kmemcg_id); 555 556 /* 557 * Here all list_lrus corresponding to the cgroup are guaranteed 558 * to remain empty, we can safely free this lru, any further 559 * memcg_list_lru_alloc() call will simply bail out. 560 */ 561 kvfree_rcu(mlru, rcu); 562 } 563 mutex_unlock(&list_lrus_mutex); 564 } 565 566 static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, 567 struct list_lru *lru) 568 { 569 int idx = memcg->kmemcg_id; 570 571 return idx < 0 || xa_load(&lru->xa, idx); 572 } 573 574 static int __memcg_list_lru_alloc(struct mem_cgroup *memcg, 575 struct list_lru *lru, gfp_t gfp) 576 { 577 unsigned long flags; 578 struct list_lru_memcg *mlru = NULL; 579 struct mem_cgroup *pos, *parent; 580 XA_STATE(xas, &lru->xa, 0); 581 582 gfp &= GFP_RECLAIM_MASK; 583 /* 584 * Because the list_lru can be reparented to the parent cgroup's 585 * list_lru, we should make sure that this cgroup and all its 586 * ancestors have allocated list_lru_memcg. 587 */ 588 do { 589 /* 590 * Keep finding the farest parent that wasn't populated 591 * until found memcg itself. 592 */ 593 pos = memcg; 594 parent = parent_mem_cgroup(pos); 595 while (!memcg_list_lru_allocated(parent, lru)) { 596 pos = parent; 597 parent = parent_mem_cgroup(pos); 598 } 599 600 if (!mlru) { 601 mlru = memcg_init_list_lru_one(lru, gfp); 602 if (!mlru) 603 return -ENOMEM; 604 } 605 xas_set(&xas, pos->kmemcg_id); 606 do { 607 xas_lock_irqsave(&xas, flags); 608 if (!xas_load(&xas) && !css_is_dying(&pos->css)) { 609 xas_store(&xas, mlru); 610 if (!xas_error(&xas)) 611 mlru = NULL; 612 } 613 xas_unlock_irqrestore(&xas, flags); 614 } while (xas_nomem(&xas, gfp)); 615 } while (pos != memcg && !css_is_dying(&pos->css)); 616 617 if (unlikely(mlru)) 618 kfree(mlru); 619 620 return xas_error(&xas); 621 } 622 623 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, 624 gfp_t gfp) 625 { 626 if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) 627 return 0; 628 return __memcg_list_lru_alloc(memcg, lru, gfp); 629 } 630 631 int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru, 632 gfp_t gfp) 633 { 634 struct mem_cgroup *memcg; 635 int res; 636 637 if (!list_lru_memcg_aware(lru)) 638 return 0; 639 640 /* Fast path when list_lru heads already exist */ 641 rcu_read_lock(); 642 memcg = folio_memcg(folio); 643 res = memcg_list_lru_allocated(memcg, lru); 644 rcu_read_unlock(); 645 if (likely(res)) 646 return 0; 647 648 /* Allocation may block, pin the memcg */ 649 memcg = get_mem_cgroup_from_folio(folio); 650 res = __memcg_list_lru_alloc(memcg, lru, gfp); 651 mem_cgroup_put(memcg); 652 return res; 653 } 654 #else 655 static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 656 { 657 } 658 659 static void memcg_destroy_list_lru(struct list_lru *lru) 660 { 661 } 662 #endif /* CONFIG_MEMCG */ 663 664 int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) 665 { 666 int i; 667 668 #ifdef CONFIG_MEMCG 669 if (shrinker) 670 lru->shrinker_id = shrinker->id; 671 else 672 lru->shrinker_id = -1; 673 674 if (mem_cgroup_kmem_disabled()) 675 memcg_aware = false; 676 #endif 677 678 lru->node = kzalloc_objs(*lru->node, nr_node_ids); 679 if (!lru->node) 680 return -ENOMEM; 681 682 for_each_node(i) 683 init_one_lru(lru, &lru->node[i].lru); 684 685 memcg_init_list_lru(lru, memcg_aware); 686 list_lru_register(lru); 687 688 return 0; 689 } 690 EXPORT_SYMBOL_GPL(__list_lru_init); 691 692 void list_lru_destroy(struct list_lru *lru) 693 { 694 /* Already destroyed or not yet initialized? */ 695 if (!lru->node) 696 return; 697 698 list_lru_unregister(lru); 699 700 memcg_destroy_list_lru(lru); 701 kfree(lru->node); 702 lru->node = NULL; 703 704 #ifdef CONFIG_MEMCG 705 lru->shrinker_id = -1; 706 #endif 707 } 708 EXPORT_SYMBOL_GPL(list_lru_destroy); 709