1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 /* 4 * zsmalloc memory allocator 5 * 6 * Copyright (C) 2011 Nitin Gupta 7 * Copyright (C) 2012, 2013 Minchan Kim 8 * 9 * This code is released using a dual license strategy: BSD/GPL 10 * You can choose the license that better fits your requirements. 11 * 12 * Released under the terms of 3-clause BSD License 13 * Released under the terms of GNU General Public License Version 2.0 14 */ 15 16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 18 /* 19 * lock ordering: 20 * page_lock 21 * pool->lock 22 * class->lock 23 * zspage->lock 24 */ 25 26 #include <linux/module.h> 27 #include <linux/kernel.h> 28 #include <linux/sched.h> 29 #include <linux/bitops.h> 30 #include <linux/errno.h> 31 #include <linux/highmem.h> 32 #include <linux/string.h> 33 #include <linux/slab.h> 34 #include <linux/pgtable.h> 35 #include <asm/tlbflush.h> 36 #include <linux/cpumask.h> 37 #include <linux/cpu.h> 38 #include <linux/vmalloc.h> 39 #include <linux/preempt.h> 40 #include <linux/spinlock.h> 41 #include <linux/sprintf.h> 42 #include <linux/shrinker.h> 43 #include <linux/types.h> 44 #include <linux/debugfs.h> 45 #include <linux/zsmalloc.h> 46 #include <linux/zpool.h> 47 #include <linux/migrate.h> 48 #include <linux/wait.h> 49 #include <linux/pagemap.h> 50 #include <linux/fs.h> 51 #include <linux/local_lock.h> 52 #include "zpdesc.h" 53 54 #define ZSPAGE_MAGIC 0x58 55 56 /* 57 * This must be power of 2 and greater than or equal to sizeof(link_free). 58 * These two conditions ensure that any 'struct link_free' itself doesn't 59 * span more than 1 page which avoids complex case of mapping 2 pages simply 60 * to restore link_free pointer values. 61 */ 62 #define ZS_ALIGN 8 63 64 #define ZS_HANDLE_SIZE (sizeof(unsigned long)) 65 66 /* 67 * Object location (<PFN>, <obj_idx>) is encoded as 68 * a single (unsigned long) handle value. 69 * 70 * Note that object index <obj_idx> starts from 0. 71 * 72 * This is made more complicated by various memory models and PAE. 73 */ 74 75 #ifndef MAX_POSSIBLE_PHYSMEM_BITS 76 #ifdef MAX_PHYSMEM_BITS 77 #define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS 78 #else 79 /* 80 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just 81 * be PAGE_SHIFT 82 */ 83 #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG 84 #endif 85 #endif 86 87 #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) 88 89 /* 90 * Head in allocated object should have OBJ_ALLOCATED_TAG 91 * to identify the object was allocated or not. 92 * It's okay to add the status bit in the least bit because 93 * header keeps handle which is 4byte-aligned address so we 94 * have room for two bit at least. 95 */ 96 #define OBJ_ALLOCATED_TAG 1 97 98 #define OBJ_TAG_BITS 1 99 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG 100 101 #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 102 #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 103 104 #define HUGE_BITS 1 105 #define FULLNESS_BITS 4 106 #define CLASS_BITS 8 107 #define MAGIC_VAL_BITS 8 108 109 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) 110 111 /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 112 #define ZS_MIN_ALLOC_SIZE \ 113 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 114 /* each chunk includes extra space to keep handle */ 115 #define ZS_MAX_ALLOC_SIZE PAGE_SIZE 116 117 /* 118 * On systems with 4K page size, this gives 255 size classes! 
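 *
 * A worked example (assuming 4 KiB pages, where ZS_MIN_ALLOC_SIZE resolves
 * to 32 and CLASS_BITS is 8):
 *
 *	ZS_SIZE_CLASS_DELTA = PAGE_SIZE >> CLASS_BITS = 4096 >> 8 = 16
 *	ZS_SIZE_CLASSES     = DIV_ROUND_UP(4096 - 32, 16) + 1      = 255
 *
 * i.e. consecutive size classes differ in object size by 16 bytes.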
 * There is a trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 * (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
				      ZS_SIZE_CLASS_DELTA) + 1)

/*
 * Pages are distinguished by the ratio of used memory (that is the ratio
 * of ->inuse objects to all objects that page can store). For example,
 * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
 *
 * The number of fullness groups is not random. It allows us to keep
 * difference between the least busy page in the group (minimum permitted
 * number of ->inuse objects) and the most busy page (maximum permitted
 * number of ->inuse objects) at a reasonable value.
 */
enum fullness_group {
	ZS_INUSE_RATIO_0,
	ZS_INUSE_RATIO_10,
	/* NOTE: 8 more fullness groups here */
	ZS_INUSE_RATIO_99       = 10,
	ZS_INUSE_RATIO_100,
	NR_FULLNESS_GROUPS,
};

enum class_stat_type {
	/* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
	ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
	ZS_OBJS_INUSE,
	NR_CLASS_STAT_TYPES,
};

struct zs_size_stat {
	unsigned long objs[NR_CLASS_STAT_TYPES];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

static size_t huge_class_size;

struct size_class {
	spinlock_t lock;
	struct list_head fullness_list[NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	int objs_per_zspage;
	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	unsigned int index;
	struct zs_size_stat stats;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, zspage->freeobj gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Free object index;
		 * It's valid for non-allocated object
		 */
		unsigned long next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class *size_class[ZS_SIZE_CLASSES];
	struct kmem_cache *handle_cachep;
	struct kmem_cache *zspage_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker *shrinker;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
	struct work_struct free_work;
#endif
	/* protect zspage migration/compaction */
	rwlock_t lock;
	atomic_t compaction_in_progress;
};

static inline void zpdesc_set_first(struct zpdesc *zpdesc)
{
	SetPagePrivate(zpdesc_page(zpdesc));
}

static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
{
	inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}

static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
{
	dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}

static inline struct zpdesc *alloc_zpdesc(gfp_t gfp)
{
	struct page *page = alloc_page(gfp);

	return page_zpdesc(page);
}

static inline void free_zpdesc(struct zpdesc *zpdesc)
{
	struct page *page = zpdesc_page(zpdesc);

	__free_page(page);
}

#define ZS_PAGE_UNLOCKED	0
#define ZS_PAGE_WRLOCKED	-1

struct zspage_lock {
	spinlock_t lock;
	int cnt;
	struct lockdep_map dep_map;
};

struct zspage {
	struct {
		unsigned int huge:HUGE_BITS;
		unsigned int fullness:FULLNESS_BITS;
		unsigned int class:CLASS_BITS + 1;
		unsigned int magic:MAGIC_VAL_BITS;
	};
	unsigned int inuse;
	unsigned int freeobj;
	struct zpdesc *first_zpdesc;
	struct list_head list; /* fullness list */
	struct zs_pool *pool;
	struct zspage_lock zsl;
};

struct mapping_area {
	local_lock_t lock;
	char *vm_buf; /* copy buffer for objects that span pages */
	char *vm_addr; /* address of kmap_local_page()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

static void zspage_lock_init(struct zspage *zspage)
{
	static struct lock_class_key __key;
	struct zspage_lock *zsl = &zspage->zsl;

	lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
	spin_lock_init(&zsl->lock);
	zsl->cnt = ZS_PAGE_UNLOCKED;
}

/*
 * The zspage lock can be held from atomic contexts, but it needs to remain
 * preemptible when held for reading because it remains held outside of those
 * atomic contexts, otherwise we unnecessarily lose preemptibility.
 *
 * To achieve this, the following rules are enforced on readers and writers:
 *
 * - Writers are blocked by both writers and readers, while readers are only
 *   blocked by writers (i.e. normal rwlock semantics).
 *
 * - Writers are always atomic (to allow readers to spin waiting for them).
 *
 * - Writers always use trylock (as the lock may be held by sleeping readers).
 *
 * - Readers may spin on the lock (as they can only wait for atomic writers).
 *
 * - Readers may sleep while holding the lock (as writes only use trylock).
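 *
 * Illustrative pairing of the helpers below (a sketch, not a real call site):
 *
 *	zspage_read_lock(zspage);
 *	...				(sleeping is allowed here)
 *	zspage_read_unlock(zspage);
 *
 *	if (zspage_write_trylock(zspage)) {
 *		...			(must stay atomic until unlock)
 *		zspage_write_unlock(zspage);
 *	}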
318 */ 319 static void zspage_read_lock(struct zspage *zspage) 320 { 321 struct zspage_lock *zsl = &zspage->zsl; 322 323 rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_); 324 325 spin_lock(&zsl->lock); 326 zsl->cnt++; 327 spin_unlock(&zsl->lock); 328 329 lock_acquired(&zsl->dep_map, _RET_IP_); 330 } 331 332 static void zspage_read_unlock(struct zspage *zspage) 333 { 334 struct zspage_lock *zsl = &zspage->zsl; 335 336 rwsem_release(&zsl->dep_map, _RET_IP_); 337 338 spin_lock(&zsl->lock); 339 zsl->cnt--; 340 spin_unlock(&zsl->lock); 341 } 342 343 static __must_check bool zspage_write_trylock(struct zspage *zspage) 344 { 345 struct zspage_lock *zsl = &zspage->zsl; 346 347 spin_lock(&zsl->lock); 348 if (zsl->cnt == ZS_PAGE_UNLOCKED) { 349 zsl->cnt = ZS_PAGE_WRLOCKED; 350 rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_); 351 lock_acquired(&zsl->dep_map, _RET_IP_); 352 return true; 353 } 354 355 spin_unlock(&zsl->lock); 356 return false; 357 } 358 359 static void zspage_write_unlock(struct zspage *zspage) 360 { 361 struct zspage_lock *zsl = &zspage->zsl; 362 363 rwsem_release(&zsl->dep_map, _RET_IP_); 364 365 zsl->cnt = ZS_PAGE_UNLOCKED; 366 spin_unlock(&zsl->lock); 367 } 368 369 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ 370 static void SetZsHugePage(struct zspage *zspage) 371 { 372 zspage->huge = 1; 373 } 374 375 static bool ZsHugePage(struct zspage *zspage) 376 { 377 return zspage->huge; 378 } 379 380 #ifdef CONFIG_COMPACTION 381 static void kick_deferred_free(struct zs_pool *pool); 382 static void init_deferred_free(struct zs_pool *pool); 383 static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); 384 #else 385 static void kick_deferred_free(struct zs_pool *pool) {} 386 static void init_deferred_free(struct zs_pool *pool) {} 387 static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} 388 #endif 389 390 static int create_cache(struct zs_pool *pool) 391 { 392 char *name; 393 394 name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name); 395 if (!name) 396 return -ENOMEM; 397 pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE, 398 0, 0, NULL); 399 kfree(name); 400 if (!pool->handle_cachep) 401 return -EINVAL; 402 403 name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name); 404 if (!name) 405 return -ENOMEM; 406 pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage), 407 0, 0, NULL); 408 kfree(name); 409 if (!pool->zspage_cachep) { 410 kmem_cache_destroy(pool->handle_cachep); 411 pool->handle_cachep = NULL; 412 return -EINVAL; 413 } 414 415 return 0; 416 } 417 418 static void destroy_cache(struct zs_pool *pool) 419 { 420 kmem_cache_destroy(pool->handle_cachep); 421 kmem_cache_destroy(pool->zspage_cachep); 422 } 423 424 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) 425 { 426 return (unsigned long)kmem_cache_alloc(pool->handle_cachep, 427 gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); 428 } 429 430 static void cache_free_handle(struct zs_pool *pool, unsigned long handle) 431 { 432 kmem_cache_free(pool->handle_cachep, (void *)handle); 433 } 434 435 static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) 436 { 437 return kmem_cache_zalloc(pool->zspage_cachep, 438 flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); 439 } 440 441 static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) 442 { 443 kmem_cache_free(pool->zspage_cachep, zspage); 444 } 445 446 /* class->lock(which owns the handle) synchronizes races */ 447 static void record_obj(unsigned long handle, unsigned long 
obj) 448 { 449 *(unsigned long *)handle = obj; 450 } 451 452 /* zpool driver */ 453 454 #ifdef CONFIG_ZPOOL 455 456 static void *zs_zpool_create(const char *name, gfp_t gfp) 457 { 458 /* 459 * Ignore global gfp flags: zs_malloc() may be invoked from 460 * different contexts and its caller must provide a valid 461 * gfp mask. 462 */ 463 return zs_create_pool(name); 464 } 465 466 static void zs_zpool_destroy(void *pool) 467 { 468 zs_destroy_pool(pool); 469 } 470 471 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, 472 unsigned long *handle) 473 { 474 *handle = zs_malloc(pool, size, gfp); 475 476 if (IS_ERR_VALUE(*handle)) 477 return PTR_ERR((void *)*handle); 478 return 0; 479 } 480 static void zs_zpool_free(void *pool, unsigned long handle) 481 { 482 zs_free(pool, handle); 483 } 484 485 static void *zs_zpool_map(void *pool, unsigned long handle, 486 enum zpool_mapmode mm) 487 { 488 enum zs_mapmode zs_mm; 489 490 switch (mm) { 491 case ZPOOL_MM_RO: 492 zs_mm = ZS_MM_RO; 493 break; 494 case ZPOOL_MM_WO: 495 zs_mm = ZS_MM_WO; 496 break; 497 case ZPOOL_MM_RW: 498 default: 499 zs_mm = ZS_MM_RW; 500 break; 501 } 502 503 return zs_map_object(pool, handle, zs_mm); 504 } 505 static void zs_zpool_unmap(void *pool, unsigned long handle) 506 { 507 zs_unmap_object(pool, handle); 508 } 509 510 static u64 zs_zpool_total_pages(void *pool) 511 { 512 return zs_get_total_pages(pool); 513 } 514 515 static struct zpool_driver zs_zpool_driver = { 516 .type = "zsmalloc", 517 .owner = THIS_MODULE, 518 .create = zs_zpool_create, 519 .destroy = zs_zpool_destroy, 520 .malloc_support_movable = true, 521 .malloc = zs_zpool_malloc, 522 .free = zs_zpool_free, 523 .map = zs_zpool_map, 524 .unmap = zs_zpool_unmap, 525 .total_pages = zs_zpool_total_pages, 526 }; 527 528 MODULE_ALIAS("zpool-zsmalloc"); 529 #endif /* CONFIG_ZPOOL */ 530 531 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 532 static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { 533 .lock = INIT_LOCAL_LOCK(lock), 534 }; 535 536 static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) 537 { 538 return PagePrivate(zpdesc_page(zpdesc)); 539 } 540 541 /* Protected by class->lock */ 542 static inline int get_zspage_inuse(struct zspage *zspage) 543 { 544 return zspage->inuse; 545 } 546 547 static inline void mod_zspage_inuse(struct zspage *zspage, int val) 548 { 549 zspage->inuse += val; 550 } 551 552 static struct zpdesc *get_first_zpdesc(struct zspage *zspage) 553 { 554 struct zpdesc *first_zpdesc = zspage->first_zpdesc; 555 556 VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc)); 557 return first_zpdesc; 558 } 559 560 #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff 561 562 static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) 563 { 564 VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); 565 return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; 566 } 567 568 static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) 569 { 570 /* With 24 bits available, we can support offsets into 16 MiB pages. 
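 * For reference, FIRST_OBJ_PAGE_TYPE_MASK is 0xffffff, i.e. (1 << 24) - 1,
 * so the largest encodable offset is 16 MiB - 1; the bits above the mask are
 * deliberately preserved by the read-modify-write below.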
*/ 571 BUILD_BUG_ON(PAGE_SIZE > SZ_16M); 572 VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); 573 VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); 574 zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; 575 zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; 576 } 577 578 static inline unsigned int get_freeobj(struct zspage *zspage) 579 { 580 return zspage->freeobj; 581 } 582 583 static inline void set_freeobj(struct zspage *zspage, unsigned int obj) 584 { 585 zspage->freeobj = obj; 586 } 587 588 static struct size_class *zspage_class(struct zs_pool *pool, 589 struct zspage *zspage) 590 { 591 return pool->size_class[zspage->class]; 592 } 593 594 /* 595 * zsmalloc divides the pool into various size classes where each 596 * class maintains a list of zspages where each zspage is divided 597 * into equal sized chunks. Each allocation falls into one of these 598 * classes depending on its size. This function returns index of the 599 * size class which has chunk size big enough to hold the given size. 600 */ 601 static int get_size_class_index(int size) 602 { 603 int idx = 0; 604 605 if (likely(size > ZS_MIN_ALLOC_SIZE)) 606 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 607 ZS_SIZE_CLASS_DELTA); 608 609 return min_t(int, ZS_SIZE_CLASSES - 1, idx); 610 } 611 612 static inline void class_stat_add(struct size_class *class, int type, 613 unsigned long cnt) 614 { 615 class->stats.objs[type] += cnt; 616 } 617 618 static inline void class_stat_sub(struct size_class *class, int type, 619 unsigned long cnt) 620 { 621 class->stats.objs[type] -= cnt; 622 } 623 624 static inline unsigned long class_stat_read(struct size_class *class, int type) 625 { 626 return class->stats.objs[type]; 627 } 628 629 #ifdef CONFIG_ZSMALLOC_STAT 630 631 static void __init zs_stat_init(void) 632 { 633 if (!debugfs_initialized()) { 634 pr_warn("debugfs not available, stat dir not created\n"); 635 return; 636 } 637 638 zs_stat_root = debugfs_create_dir("zsmalloc", NULL); 639 } 640 641 static void __exit zs_stat_exit(void) 642 { 643 debugfs_remove_recursive(zs_stat_root); 644 } 645 646 static unsigned long zs_can_compact(struct size_class *class); 647 648 static int zs_stats_size_show(struct seq_file *s, void *v) 649 { 650 int i, fg; 651 struct zs_pool *pool = s->private; 652 struct size_class *class; 653 int objs_per_zspage; 654 unsigned long obj_allocated, obj_used, pages_used, freeable; 655 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; 656 unsigned long total_freeable = 0; 657 unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, }; 658 659 seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n", 660 "class", "size", "10%", "20%", "30%", "40%", 661 "50%", "60%", "70%", "80%", "90%", "99%", "100%", 662 "obj_allocated", "obj_used", "pages_used", 663 "pages_per_zspage", "freeable"); 664 665 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 666 667 class = pool->size_class[i]; 668 669 if (class->index != i) 670 continue; 671 672 spin_lock(&class->lock); 673 674 seq_printf(s, " %5u %5u ", i, class->size); 675 for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) { 676 inuse_totals[fg] += class_stat_read(class, fg); 677 seq_printf(s, "%9lu ", class_stat_read(class, fg)); 678 } 679 680 obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED); 681 obj_used = class_stat_read(class, ZS_OBJS_INUSE); 682 freeable = zs_can_compact(class); 683 spin_unlock(&class->lock); 684 685 objs_per_zspage = class->objs_per_zspage; 686 pages_used = obj_allocated / 
objs_per_zspage * 687 class->pages_per_zspage; 688 689 seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n", 690 obj_allocated, obj_used, pages_used, 691 class->pages_per_zspage, freeable); 692 693 total_objs += obj_allocated; 694 total_used_objs += obj_used; 695 total_pages += pages_used; 696 total_freeable += freeable; 697 } 698 699 seq_puts(s, "\n"); 700 seq_printf(s, " %5s %5s ", "Total", ""); 701 702 for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) 703 seq_printf(s, "%9lu ", inuse_totals[fg]); 704 705 seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n", 706 total_objs, total_used_objs, total_pages, "", 707 total_freeable); 708 709 return 0; 710 } 711 DEFINE_SHOW_ATTRIBUTE(zs_stats_size); 712 713 static void zs_pool_stat_create(struct zs_pool *pool, const char *name) 714 { 715 if (!zs_stat_root) { 716 pr_warn("no root stat dir, not creating <%s> stat dir\n", name); 717 return; 718 } 719 720 pool->stat_dentry = debugfs_create_dir(name, zs_stat_root); 721 722 debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool, 723 &zs_stats_size_fops); 724 } 725 726 static void zs_pool_stat_destroy(struct zs_pool *pool) 727 { 728 debugfs_remove_recursive(pool->stat_dentry); 729 } 730 731 #else /* CONFIG_ZSMALLOC_STAT */ 732 static void __init zs_stat_init(void) 733 { 734 } 735 736 static void __exit zs_stat_exit(void) 737 { 738 } 739 740 static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) 741 { 742 } 743 744 static inline void zs_pool_stat_destroy(struct zs_pool *pool) 745 { 746 } 747 #endif 748 749 750 /* 751 * For each size class, zspages are divided into different groups 752 * depending on their usage ratio. This function returns fullness 753 * status of the given page. 754 */ 755 static int get_fullness_group(struct size_class *class, struct zspage *zspage) 756 { 757 int inuse, objs_per_zspage, ratio; 758 759 inuse = get_zspage_inuse(zspage); 760 objs_per_zspage = class->objs_per_zspage; 761 762 if (inuse == 0) 763 return ZS_INUSE_RATIO_0; 764 if (inuse == objs_per_zspage) 765 return ZS_INUSE_RATIO_100; 766 767 ratio = 100 * inuse / objs_per_zspage; 768 /* 769 * Take integer division into consideration: a page with one inuse 770 * object out of 127 possible, will end up having 0 usage ratio, 771 * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group. 772 */ 773 return ratio / 10 + 1; 774 } 775 776 /* 777 * Each size class maintains various freelists and zspages are assigned 778 * to one of these freelists based on the number of live objects they 779 * have. This functions inserts the given zspage into the freelist 780 * identified by <class, fullness_group>. 781 */ 782 static void insert_zspage(struct size_class *class, 783 struct zspage *zspage, 784 int fullness) 785 { 786 class_stat_add(class, fullness, 1); 787 list_add(&zspage->list, &class->fullness_list[fullness]); 788 zspage->fullness = fullness; 789 } 790 791 /* 792 * This function removes the given zspage from the freelist identified 793 * by <class, fullness_group>. 794 */ 795 static void remove_zspage(struct size_class *class, struct zspage *zspage) 796 { 797 int fullness = zspage->fullness; 798 799 VM_BUG_ON(list_empty(&class->fullness_list[fullness])); 800 801 list_del_init(&zspage->list); 802 class_stat_sub(class, fullness, 1); 803 } 804 805 /* 806 * Each size class maintains zspages in different fullness groups depending 807 * on the number of live objects they contain. 
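 *
 * As a concrete sketch of the mapping done by get_fullness_group() above
 * (assuming a class with objs_per_zspage == 32): a zspage with inuse == 26
 * has ratio 100 * 26 / 32 == 81 and therefore lands in group
 * 81 / 10 + 1 == 9, i.e. ZS_INUSE_RATIO_90.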
When allocating or freeing 808 * objects, the fullness status of the page can change, for instance, from 809 * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function 810 * checks if such a status change has occurred for the given page and 811 * accordingly moves the page from the list of the old fullness group to that 812 * of the new fullness group. 813 */ 814 static int fix_fullness_group(struct size_class *class, struct zspage *zspage) 815 { 816 int newfg; 817 818 newfg = get_fullness_group(class, zspage); 819 if (newfg == zspage->fullness) 820 goto out; 821 822 remove_zspage(class, zspage); 823 insert_zspage(class, zspage, newfg); 824 out: 825 return newfg; 826 } 827 828 static struct zspage *get_zspage(struct zpdesc *zpdesc) 829 { 830 struct zspage *zspage = zpdesc->zspage; 831 832 BUG_ON(zspage->magic != ZSPAGE_MAGIC); 833 return zspage; 834 } 835 836 static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) 837 { 838 struct zspage *zspage = get_zspage(zpdesc); 839 840 if (unlikely(ZsHugePage(zspage))) 841 return NULL; 842 843 return zpdesc->next; 844 } 845 846 /** 847 * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value 848 * @obj: the encoded object value 849 * @zpdesc: zpdesc object resides in zspage 850 * @obj_idx: object index 851 */ 852 static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, 853 unsigned int *obj_idx) 854 { 855 *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); 856 *obj_idx = (obj & OBJ_INDEX_MASK); 857 } 858 859 static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) 860 { 861 *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); 862 } 863 864 /** 865 * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>) 866 * @zpdesc: zpdesc object resides in zspage 867 * @obj_idx: object index 868 */ 869 static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) 870 { 871 unsigned long obj; 872 873 obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; 874 obj |= obj_idx & OBJ_INDEX_MASK; 875 876 return obj; 877 } 878 879 static unsigned long handle_to_obj(unsigned long handle) 880 { 881 return *(unsigned long *)handle; 882 } 883 884 static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, 885 unsigned long *phandle) 886 { 887 unsigned long handle; 888 struct zspage *zspage = get_zspage(zpdesc); 889 890 if (unlikely(ZsHugePage(zspage))) { 891 VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); 892 handle = zpdesc->handle; 893 } else 894 handle = *(unsigned long *)obj; 895 896 if (!(handle & OBJ_ALLOCATED_TAG)) 897 return false; 898 899 /* Clear all tags before returning the handle */ 900 *phandle = handle & ~OBJ_TAG_MASK; 901 return true; 902 } 903 904 static void reset_zpdesc(struct zpdesc *zpdesc) 905 { 906 struct page *page = zpdesc_page(zpdesc); 907 908 __ClearPageMovable(page); 909 ClearPagePrivate(page); 910 zpdesc->zspage = NULL; 911 zpdesc->next = NULL; 912 __ClearPageZsmalloc(page); 913 } 914 915 static int trylock_zspage(struct zspage *zspage) 916 { 917 struct zpdesc *cursor, *fail; 918 919 for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor = 920 get_next_zpdesc(cursor)) { 921 if (!zpdesc_trylock(cursor)) { 922 fail = cursor; 923 goto unlock; 924 } 925 } 926 927 return 1; 928 unlock: 929 for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor = 930 get_next_zpdesc(cursor)) 931 zpdesc_unlock(cursor); 932 933 return 0; 934 } 935 936 static void __free_zspage(struct zs_pool *pool, struct size_class *class, 937 struct zspage *zspage) 938 { 939 struct 
zpdesc *zpdesc, *next; 940 941 assert_spin_locked(&class->lock); 942 943 VM_BUG_ON(get_zspage_inuse(zspage)); 944 VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); 945 946 next = zpdesc = get_first_zpdesc(zspage); 947 do { 948 VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); 949 next = get_next_zpdesc(zpdesc); 950 reset_zpdesc(zpdesc); 951 zpdesc_unlock(zpdesc); 952 zpdesc_dec_zone_page_state(zpdesc); 953 zpdesc_put(zpdesc); 954 zpdesc = next; 955 } while (zpdesc != NULL); 956 957 cache_free_zspage(pool, zspage); 958 959 class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); 960 atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); 961 } 962 963 static void free_zspage(struct zs_pool *pool, struct size_class *class, 964 struct zspage *zspage) 965 { 966 VM_BUG_ON(get_zspage_inuse(zspage)); 967 VM_BUG_ON(list_empty(&zspage->list)); 968 969 /* 970 * Since zs_free couldn't be sleepable, this function cannot call 971 * lock_page. The page locks trylock_zspage got will be released 972 * by __free_zspage. 973 */ 974 if (!trylock_zspage(zspage)) { 975 kick_deferred_free(pool); 976 return; 977 } 978 979 remove_zspage(class, zspage); 980 __free_zspage(pool, class, zspage); 981 } 982 983 /* Initialize a newly allocated zspage */ 984 static void init_zspage(struct size_class *class, struct zspage *zspage) 985 { 986 unsigned int freeobj = 1; 987 unsigned long off = 0; 988 struct zpdesc *zpdesc = get_first_zpdesc(zspage); 989 990 while (zpdesc) { 991 struct zpdesc *next_zpdesc; 992 struct link_free *link; 993 void *vaddr; 994 995 set_first_obj_offset(zpdesc, off); 996 997 vaddr = kmap_local_zpdesc(zpdesc); 998 link = (struct link_free *)vaddr + off / sizeof(*link); 999 1000 while ((off += class->size) < PAGE_SIZE) { 1001 link->next = freeobj++ << OBJ_TAG_BITS; 1002 link += class->size / sizeof(*link); 1003 } 1004 1005 /* 1006 * We now come to the last (full or partial) object on this 1007 * page, which must point to the first object on the next 1008 * page (if present) 1009 */ 1010 next_zpdesc = get_next_zpdesc(zpdesc); 1011 if (next_zpdesc) { 1012 link->next = freeobj++ << OBJ_TAG_BITS; 1013 } else { 1014 /* 1015 * Reset OBJ_TAG_BITS bit to last link to tell 1016 * whether it's allocated object or not. 1017 */ 1018 link->next = -1UL << OBJ_TAG_BITS; 1019 } 1020 kunmap_local(vaddr); 1021 zpdesc = next_zpdesc; 1022 off %= PAGE_SIZE; 1023 } 1024 1025 set_freeobj(zspage, 0); 1026 } 1027 1028 static void create_page_chain(struct size_class *class, struct zspage *zspage, 1029 struct zpdesc *zpdescs[]) 1030 { 1031 int i; 1032 struct zpdesc *zpdesc; 1033 struct zpdesc *prev_zpdesc = NULL; 1034 int nr_zpdescs = class->pages_per_zspage; 1035 1036 /* 1037 * Allocate individual pages and link them together as: 1038 * 1. all pages are linked together using zpdesc->next 1039 * 2. each sub-page point to zspage using zpdesc->zspage 1040 * 1041 * we set PG_private to identify the first zpdesc (i.e. no other zpdesc 1042 * has this flag set). 
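 *
 * For a 3-page zspage the resulting layout is roughly (illustrative):
 *
 *	zspage->first_zpdesc --> zpdesc0 --> zpdesc1 --> zpdesc2 --> NULL
 *	                         (PG_private)
 *
 * with each zpdesc->zspage pointing back at the owning zspage.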
1043 */ 1044 for (i = 0; i < nr_zpdescs; i++) { 1045 zpdesc = zpdescs[i]; 1046 zpdesc->zspage = zspage; 1047 zpdesc->next = NULL; 1048 if (i == 0) { 1049 zspage->first_zpdesc = zpdesc; 1050 zpdesc_set_first(zpdesc); 1051 if (unlikely(class->objs_per_zspage == 1 && 1052 class->pages_per_zspage == 1)) 1053 SetZsHugePage(zspage); 1054 } else { 1055 prev_zpdesc->next = zpdesc; 1056 } 1057 prev_zpdesc = zpdesc; 1058 } 1059 } 1060 1061 /* 1062 * Allocate a zspage for the given size class 1063 */ 1064 static struct zspage *alloc_zspage(struct zs_pool *pool, 1065 struct size_class *class, 1066 gfp_t gfp) 1067 { 1068 int i; 1069 struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; 1070 struct zspage *zspage = cache_alloc_zspage(pool, gfp); 1071 1072 if (!zspage) 1073 return NULL; 1074 1075 zspage->magic = ZSPAGE_MAGIC; 1076 zspage->pool = pool; 1077 zspage->class = class->index; 1078 zspage_lock_init(zspage); 1079 1080 for (i = 0; i < class->pages_per_zspage; i++) { 1081 struct zpdesc *zpdesc; 1082 1083 zpdesc = alloc_zpdesc(gfp); 1084 if (!zpdesc) { 1085 while (--i >= 0) { 1086 zpdesc_dec_zone_page_state(zpdescs[i]); 1087 __zpdesc_clear_zsmalloc(zpdescs[i]); 1088 free_zpdesc(zpdescs[i]); 1089 } 1090 cache_free_zspage(pool, zspage); 1091 return NULL; 1092 } 1093 __zpdesc_set_zsmalloc(zpdesc); 1094 1095 zpdesc_inc_zone_page_state(zpdesc); 1096 zpdescs[i] = zpdesc; 1097 } 1098 1099 create_page_chain(class, zspage, zpdescs); 1100 init_zspage(class, zspage); 1101 1102 return zspage; 1103 } 1104 1105 static struct zspage *find_get_zspage(struct size_class *class) 1106 { 1107 int i; 1108 struct zspage *zspage; 1109 1110 for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) { 1111 zspage = list_first_entry_or_null(&class->fullness_list[i], 1112 struct zspage, list); 1113 if (zspage) 1114 break; 1115 } 1116 1117 return zspage; 1118 } 1119 1120 static inline int __zs_cpu_up(struct mapping_area *area) 1121 { 1122 /* 1123 * Make sure we don't leak memory if a cpu UP notification 1124 * and zs_init() race and both call zs_cpu_up() on the same cpu 1125 */ 1126 if (area->vm_buf) 1127 return 0; 1128 area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); 1129 if (!area->vm_buf) 1130 return -ENOMEM; 1131 return 0; 1132 } 1133 1134 static inline void __zs_cpu_down(struct mapping_area *area) 1135 { 1136 kfree(area->vm_buf); 1137 area->vm_buf = NULL; 1138 } 1139 1140 static void *__zs_map_object(struct mapping_area *area, 1141 struct zpdesc *zpdescs[2], int off, int size) 1142 { 1143 size_t sizes[2]; 1144 char *buf = area->vm_buf; 1145 1146 /* disable page faults to match kmap_local_page() return conditions */ 1147 pagefault_disable(); 1148 1149 /* no read fastpath */ 1150 if (area->vm_mm == ZS_MM_WO) 1151 goto out; 1152 1153 sizes[0] = PAGE_SIZE - off; 1154 sizes[1] = size - sizes[0]; 1155 1156 /* copy object to per-cpu buffer */ 1157 memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); 1158 memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); 1159 out: 1160 return area->vm_buf; 1161 } 1162 1163 static void __zs_unmap_object(struct mapping_area *area, 1164 struct zpdesc *zpdescs[2], int off, int size) 1165 { 1166 size_t sizes[2]; 1167 char *buf; 1168 1169 /* no write fastpath */ 1170 if (area->vm_mm == ZS_MM_RO) 1171 goto out; 1172 1173 buf = area->vm_buf; 1174 buf = buf + ZS_HANDLE_SIZE; 1175 size -= ZS_HANDLE_SIZE; 1176 off += ZS_HANDLE_SIZE; 1177 1178 sizes[0] = PAGE_SIZE - off; 1179 sizes[1] = size - sizes[0]; 1180 1181 /* copy per-cpu buffer to object */ 1182 
memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]); 1183 memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]); 1184 1185 out: 1186 /* enable page faults to match kunmap_local() return conditions */ 1187 pagefault_enable(); 1188 } 1189 1190 static int zs_cpu_prepare(unsigned int cpu) 1191 { 1192 struct mapping_area *area; 1193 1194 area = &per_cpu(zs_map_area, cpu); 1195 return __zs_cpu_up(area); 1196 } 1197 1198 static int zs_cpu_dead(unsigned int cpu) 1199 { 1200 struct mapping_area *area; 1201 1202 area = &per_cpu(zs_map_area, cpu); 1203 __zs_cpu_down(area); 1204 return 0; 1205 } 1206 1207 static bool can_merge(struct size_class *prev, int pages_per_zspage, 1208 int objs_per_zspage) 1209 { 1210 if (prev->pages_per_zspage == pages_per_zspage && 1211 prev->objs_per_zspage == objs_per_zspage) 1212 return true; 1213 1214 return false; 1215 } 1216 1217 static bool zspage_full(struct size_class *class, struct zspage *zspage) 1218 { 1219 return get_zspage_inuse(zspage) == class->objs_per_zspage; 1220 } 1221 1222 static bool zspage_empty(struct zspage *zspage) 1223 { 1224 return get_zspage_inuse(zspage) == 0; 1225 } 1226 1227 /** 1228 * zs_lookup_class_index() - Returns index of the zsmalloc &size_class 1229 * that hold objects of the provided size. 1230 * @pool: zsmalloc pool to use 1231 * @size: object size 1232 * 1233 * Context: Any context. 1234 * 1235 * Return: the index of the zsmalloc &size_class that hold objects of the 1236 * provided size. 1237 */ 1238 unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) 1239 { 1240 struct size_class *class; 1241 1242 class = pool->size_class[get_size_class_index(size)]; 1243 1244 return class->index; 1245 } 1246 EXPORT_SYMBOL_GPL(zs_lookup_class_index); 1247 1248 unsigned long zs_get_total_pages(struct zs_pool *pool) 1249 { 1250 return atomic_long_read(&pool->pages_allocated); 1251 } 1252 EXPORT_SYMBOL_GPL(zs_get_total_pages); 1253 1254 /** 1255 * zs_map_object - get address of allocated object from handle. 1256 * @pool: pool from which the object was allocated 1257 * @handle: handle returned from zs_malloc 1258 * @mm: mapping mode to use 1259 * 1260 * Before using an object allocated from zs_malloc, it must be mapped using 1261 * this function. When done with the object, it must be unmapped using 1262 * zs_unmap_object. 1263 * 1264 * Only one object can be mapped per cpu at a time. There is no protection 1265 * against nested mappings. 1266 * 1267 * This function returns with preemption and page faults disabled. 1268 */ 1269 void *zs_map_object(struct zs_pool *pool, unsigned long handle, 1270 enum zs_mapmode mm) 1271 { 1272 struct zspage *zspage; 1273 struct zpdesc *zpdesc; 1274 unsigned long obj, off; 1275 unsigned int obj_idx; 1276 1277 struct size_class *class; 1278 struct mapping_area *area; 1279 struct zpdesc *zpdescs[2]; 1280 void *ret; 1281 1282 /* 1283 * Because we use per-cpu mapping areas shared among the 1284 * pools/users, we can't allow mapping in interrupt context 1285 * because it can corrupt another users mappings. 1286 */ 1287 BUG_ON(in_interrupt()); 1288 1289 /* It guarantees it can get zspage from handle safely */ 1290 read_lock(&pool->lock); 1291 obj = handle_to_obj(handle); 1292 obj_to_location(obj, &zpdesc, &obj_idx); 1293 zspage = get_zspage(zpdesc); 1294 1295 /* 1296 * migration cannot move any zpages in this zspage. 
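 *
 * The resulting lock handoff around this point is (sketch):
 *
 *	read_lock(&pool->lock);		resolve handle while zspage is stable
 *	zspage_read_lock(zspage);	pin zspage against migration
 *	read_unlock(&pool->lock);	keep only the finer-grained lock
 *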
Here, class->lock 1297 * is too heavy since callers would take some time until they calls 1298 * zs_unmap_object API so delegate the locking from class to zspage 1299 * which is smaller granularity. 1300 */ 1301 zspage_read_lock(zspage); 1302 read_unlock(&pool->lock); 1303 1304 class = zspage_class(pool, zspage); 1305 off = offset_in_page(class->size * obj_idx); 1306 1307 local_lock(&zs_map_area.lock); 1308 area = this_cpu_ptr(&zs_map_area); 1309 area->vm_mm = mm; 1310 if (off + class->size <= PAGE_SIZE) { 1311 /* this object is contained entirely within a page */ 1312 area->vm_addr = kmap_local_zpdesc(zpdesc); 1313 ret = area->vm_addr + off; 1314 goto out; 1315 } 1316 1317 /* this object spans two pages */ 1318 zpdescs[0] = zpdesc; 1319 zpdescs[1] = get_next_zpdesc(zpdesc); 1320 BUG_ON(!zpdescs[1]); 1321 1322 ret = __zs_map_object(area, zpdescs, off, class->size); 1323 out: 1324 if (likely(!ZsHugePage(zspage))) 1325 ret += ZS_HANDLE_SIZE; 1326 1327 return ret; 1328 } 1329 EXPORT_SYMBOL_GPL(zs_map_object); 1330 1331 void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1332 { 1333 struct zspage *zspage; 1334 struct zpdesc *zpdesc; 1335 unsigned long obj, off; 1336 unsigned int obj_idx; 1337 1338 struct size_class *class; 1339 struct mapping_area *area; 1340 1341 obj = handle_to_obj(handle); 1342 obj_to_location(obj, &zpdesc, &obj_idx); 1343 zspage = get_zspage(zpdesc); 1344 class = zspage_class(pool, zspage); 1345 off = offset_in_page(class->size * obj_idx); 1346 1347 area = this_cpu_ptr(&zs_map_area); 1348 if (off + class->size <= PAGE_SIZE) 1349 kunmap_local(area->vm_addr); 1350 else { 1351 struct zpdesc *zpdescs[2]; 1352 1353 zpdescs[0] = zpdesc; 1354 zpdescs[1] = get_next_zpdesc(zpdesc); 1355 BUG_ON(!zpdescs[1]); 1356 1357 __zs_unmap_object(area, zpdescs, off, class->size); 1358 } 1359 local_unlock(&zs_map_area.lock); 1360 1361 zspage_read_unlock(zspage); 1362 } 1363 EXPORT_SYMBOL_GPL(zs_unmap_object); 1364 1365 void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, 1366 void *local_copy) 1367 { 1368 struct zspage *zspage; 1369 struct zpdesc *zpdesc; 1370 unsigned long obj, off; 1371 unsigned int obj_idx; 1372 struct size_class *class; 1373 void *addr; 1374 1375 /* Guarantee we can get zspage from handle safely */ 1376 read_lock(&pool->lock); 1377 obj = handle_to_obj(handle); 1378 obj_to_location(obj, &zpdesc, &obj_idx); 1379 zspage = get_zspage(zpdesc); 1380 1381 /* Make sure migration doesn't move any pages in this zspage */ 1382 zspage_read_lock(zspage); 1383 read_unlock(&pool->lock); 1384 1385 class = zspage_class(pool, zspage); 1386 off = offset_in_page(class->size * obj_idx); 1387 1388 if (off + class->size <= PAGE_SIZE) { 1389 /* this object is contained entirely within a page */ 1390 addr = kmap_local_zpdesc(zpdesc); 1391 addr += off; 1392 } else { 1393 size_t sizes[2]; 1394 1395 /* this object spans two pages */ 1396 sizes[0] = PAGE_SIZE - off; 1397 sizes[1] = class->size - sizes[0]; 1398 addr = local_copy; 1399 1400 memcpy_from_page(addr, zpdesc_page(zpdesc), 1401 off, sizes[0]); 1402 zpdesc = get_next_zpdesc(zpdesc); 1403 memcpy_from_page(addr + sizes[0], 1404 zpdesc_page(zpdesc), 1405 0, sizes[1]); 1406 } 1407 1408 if (!ZsHugePage(zspage)) 1409 addr += ZS_HANDLE_SIZE; 1410 1411 return addr; 1412 } 1413 EXPORT_SYMBOL_GPL(zs_obj_read_begin); 1414 1415 void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, 1416 void *handle_mem) 1417 { 1418 struct zspage *zspage; 1419 struct zpdesc *zpdesc; 1420 unsigned long obj, off; 1421 unsigned int 
obj_idx; 1422 struct size_class *class; 1423 1424 obj = handle_to_obj(handle); 1425 obj_to_location(obj, &zpdesc, &obj_idx); 1426 zspage = get_zspage(zpdesc); 1427 class = zspage_class(pool, zspage); 1428 off = offset_in_page(class->size * obj_idx); 1429 1430 if (off + class->size <= PAGE_SIZE) { 1431 if (!ZsHugePage(zspage)) 1432 off += ZS_HANDLE_SIZE; 1433 handle_mem -= off; 1434 kunmap_local(handle_mem); 1435 } 1436 1437 zspage_read_unlock(zspage); 1438 } 1439 EXPORT_SYMBOL_GPL(zs_obj_read_end); 1440 1441 void zs_obj_write(struct zs_pool *pool, unsigned long handle, 1442 void *handle_mem, size_t mem_len) 1443 { 1444 struct zspage *zspage; 1445 struct zpdesc *zpdesc; 1446 unsigned long obj, off; 1447 unsigned int obj_idx; 1448 struct size_class *class; 1449 1450 /* Guarantee we can get zspage from handle safely */ 1451 read_lock(&pool->lock); 1452 obj = handle_to_obj(handle); 1453 obj_to_location(obj, &zpdesc, &obj_idx); 1454 zspage = get_zspage(zpdesc); 1455 1456 /* Make sure migration doesn't move any pages in this zspage */ 1457 zspage_read_lock(zspage); 1458 read_unlock(&pool->lock); 1459 1460 class = zspage_class(pool, zspage); 1461 off = offset_in_page(class->size * obj_idx); 1462 1463 if (off + class->size <= PAGE_SIZE) { 1464 /* this object is contained entirely within a page */ 1465 void *dst = kmap_local_zpdesc(zpdesc); 1466 1467 if (!ZsHugePage(zspage)) 1468 off += ZS_HANDLE_SIZE; 1469 memcpy(dst + off, handle_mem, mem_len); 1470 kunmap_local(dst); 1471 } else { 1472 /* this object spans two pages */ 1473 size_t sizes[2]; 1474 1475 off += ZS_HANDLE_SIZE; 1476 sizes[0] = PAGE_SIZE - off; 1477 sizes[1] = mem_len - sizes[0]; 1478 1479 memcpy_to_page(zpdesc_page(zpdesc), off, 1480 handle_mem, sizes[0]); 1481 zpdesc = get_next_zpdesc(zpdesc); 1482 memcpy_to_page(zpdesc_page(zpdesc), 0, 1483 handle_mem + sizes[0], sizes[1]); 1484 } 1485 1486 zspage_read_unlock(zspage); 1487 } 1488 EXPORT_SYMBOL_GPL(zs_obj_write); 1489 1490 /** 1491 * zs_huge_class_size() - Returns the size (in bytes) of the first huge 1492 * zsmalloc &size_class. 1493 * @pool: zsmalloc pool to use 1494 * 1495 * The function returns the size of the first huge class - any object of equal 1496 * or bigger size will be stored in zspage consisting of a single physical 1497 * page. 1498 * 1499 * Context: Any context. 1500 * 1501 * Return: the size (in bytes) of the first huge zsmalloc &size_class. 
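 *
 * As an illustrative (non-normative) use, a zram-like caller can treat any
 * buffer whose compressed size reaches this value as incompressible, since
 * it will occupy a whole page in the pool anyway:
 *
 *	if (comp_len >= zs_huge_class_size(pool))
 *		comp_len = PAGE_SIZE;	(i.e. store the page uncompressed)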
1502 */ 1503 size_t zs_huge_class_size(struct zs_pool *pool) 1504 { 1505 return huge_class_size; 1506 } 1507 EXPORT_SYMBOL_GPL(zs_huge_class_size); 1508 1509 static unsigned long obj_malloc(struct zs_pool *pool, 1510 struct zspage *zspage, unsigned long handle) 1511 { 1512 int i, nr_zpdesc, offset; 1513 unsigned long obj; 1514 struct link_free *link; 1515 struct size_class *class; 1516 1517 struct zpdesc *m_zpdesc; 1518 unsigned long m_offset; 1519 void *vaddr; 1520 1521 class = pool->size_class[zspage->class]; 1522 obj = get_freeobj(zspage); 1523 1524 offset = obj * class->size; 1525 nr_zpdesc = offset >> PAGE_SHIFT; 1526 m_offset = offset_in_page(offset); 1527 m_zpdesc = get_first_zpdesc(zspage); 1528 1529 for (i = 0; i < nr_zpdesc; i++) 1530 m_zpdesc = get_next_zpdesc(m_zpdesc); 1531 1532 vaddr = kmap_local_zpdesc(m_zpdesc); 1533 link = (struct link_free *)vaddr + m_offset / sizeof(*link); 1534 set_freeobj(zspage, link->next >> OBJ_TAG_BITS); 1535 if (likely(!ZsHugePage(zspage))) 1536 /* record handle in the header of allocated chunk */ 1537 link->handle = handle | OBJ_ALLOCATED_TAG; 1538 else 1539 zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG; 1540 1541 kunmap_local(vaddr); 1542 mod_zspage_inuse(zspage, 1); 1543 1544 obj = location_to_obj(m_zpdesc, obj); 1545 record_obj(handle, obj); 1546 1547 return obj; 1548 } 1549 1550 1551 /** 1552 * zs_malloc - Allocate block of given size from pool. 1553 * @pool: pool to allocate from 1554 * @size: size of block to allocate 1555 * @gfp: gfp flags when allocating object 1556 * 1557 * On success, handle to the allocated object is returned, 1558 * otherwise an ERR_PTR(). 1559 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 1560 */ 1561 unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) 1562 { 1563 unsigned long handle; 1564 struct size_class *class; 1565 int newfg; 1566 struct zspage *zspage; 1567 1568 if (unlikely(!size)) 1569 return (unsigned long)ERR_PTR(-EINVAL); 1570 1571 if (unlikely(size > ZS_MAX_ALLOC_SIZE)) 1572 return (unsigned long)ERR_PTR(-ENOSPC); 1573 1574 handle = cache_alloc_handle(pool, gfp); 1575 if (!handle) 1576 return (unsigned long)ERR_PTR(-ENOMEM); 1577 1578 /* extra space in chunk to keep the handle */ 1579 size += ZS_HANDLE_SIZE; 1580 class = pool->size_class[get_size_class_index(size)]; 1581 1582 /* class->lock effectively protects the zpage migration */ 1583 spin_lock(&class->lock); 1584 zspage = find_get_zspage(class); 1585 if (likely(zspage)) { 1586 obj_malloc(pool, zspage, handle); 1587 /* Now move the zspage to another fullness group, if required */ 1588 fix_fullness_group(class, zspage); 1589 class_stat_add(class, ZS_OBJS_INUSE, 1); 1590 1591 goto out; 1592 } 1593 1594 spin_unlock(&class->lock); 1595 1596 zspage = alloc_zspage(pool, class, gfp); 1597 if (!zspage) { 1598 cache_free_handle(pool, handle); 1599 return (unsigned long)ERR_PTR(-ENOMEM); 1600 } 1601 1602 spin_lock(&class->lock); 1603 obj_malloc(pool, zspage, handle); 1604 newfg = get_fullness_group(class, zspage); 1605 insert_zspage(class, zspage, newfg); 1606 atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); 1607 class_stat_add(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); 1608 class_stat_add(class, ZS_OBJS_INUSE, 1); 1609 1610 /* We completely set up zspage so mark them as movable */ 1611 SetZsPageMovable(pool, zspage); 1612 out: 1613 spin_unlock(&class->lock); 1614 1615 return handle; 1616 } 1617 EXPORT_SYMBOL_GPL(zs_malloc); 1618 1619 static void obj_free(int class_size, unsigned long obj) 
1620 { 1621 struct link_free *link; 1622 struct zspage *zspage; 1623 struct zpdesc *f_zpdesc; 1624 unsigned long f_offset; 1625 unsigned int f_objidx; 1626 void *vaddr; 1627 1628 1629 obj_to_location(obj, &f_zpdesc, &f_objidx); 1630 f_offset = offset_in_page(class_size * f_objidx); 1631 zspage = get_zspage(f_zpdesc); 1632 1633 vaddr = kmap_local_zpdesc(f_zpdesc); 1634 link = (struct link_free *)(vaddr + f_offset); 1635 1636 /* Insert this object in containing zspage's freelist */ 1637 if (likely(!ZsHugePage(zspage))) 1638 link->next = get_freeobj(zspage) << OBJ_TAG_BITS; 1639 else 1640 f_zpdesc->handle = 0; 1641 set_freeobj(zspage, f_objidx); 1642 1643 kunmap_local(vaddr); 1644 mod_zspage_inuse(zspage, -1); 1645 } 1646 1647 void zs_free(struct zs_pool *pool, unsigned long handle) 1648 { 1649 struct zspage *zspage; 1650 struct zpdesc *f_zpdesc; 1651 unsigned long obj; 1652 struct size_class *class; 1653 int fullness; 1654 1655 if (IS_ERR_OR_NULL((void *)handle)) 1656 return; 1657 1658 /* 1659 * The pool->lock protects the race with zpage's migration 1660 * so it's safe to get the page from handle. 1661 */ 1662 read_lock(&pool->lock); 1663 obj = handle_to_obj(handle); 1664 obj_to_zpdesc(obj, &f_zpdesc); 1665 zspage = get_zspage(f_zpdesc); 1666 class = zspage_class(pool, zspage); 1667 spin_lock(&class->lock); 1668 read_unlock(&pool->lock); 1669 1670 class_stat_sub(class, ZS_OBJS_INUSE, 1); 1671 obj_free(class->size, obj); 1672 1673 fullness = fix_fullness_group(class, zspage); 1674 if (fullness == ZS_INUSE_RATIO_0) 1675 free_zspage(pool, class, zspage); 1676 1677 spin_unlock(&class->lock); 1678 cache_free_handle(pool, handle); 1679 } 1680 EXPORT_SYMBOL_GPL(zs_free); 1681 1682 static void zs_object_copy(struct size_class *class, unsigned long dst, 1683 unsigned long src) 1684 { 1685 struct zpdesc *s_zpdesc, *d_zpdesc; 1686 unsigned int s_objidx, d_objidx; 1687 unsigned long s_off, d_off; 1688 void *s_addr, *d_addr; 1689 int s_size, d_size, size; 1690 int written = 0; 1691 1692 s_size = d_size = class->size; 1693 1694 obj_to_location(src, &s_zpdesc, &s_objidx); 1695 obj_to_location(dst, &d_zpdesc, &d_objidx); 1696 1697 s_off = offset_in_page(class->size * s_objidx); 1698 d_off = offset_in_page(class->size * d_objidx); 1699 1700 if (s_off + class->size > PAGE_SIZE) 1701 s_size = PAGE_SIZE - s_off; 1702 1703 if (d_off + class->size > PAGE_SIZE) 1704 d_size = PAGE_SIZE - d_off; 1705 1706 s_addr = kmap_local_zpdesc(s_zpdesc); 1707 d_addr = kmap_local_zpdesc(d_zpdesc); 1708 1709 while (1) { 1710 size = min(s_size, d_size); 1711 memcpy(d_addr + d_off, s_addr + s_off, size); 1712 written += size; 1713 1714 if (written == class->size) 1715 break; 1716 1717 s_off += size; 1718 s_size -= size; 1719 d_off += size; 1720 d_size -= size; 1721 1722 /* 1723 * Calling kunmap_local(d_addr) is necessary. kunmap_local() 1724 * calls must occurs in reverse order of calls to kmap_local_page(). 1725 * So, to call kunmap_local(s_addr) we should first call 1726 * kunmap_local(d_addr). For more details see 1727 * Documentation/mm/highmem.rst. 
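 *
 * In other words, the local mappings must strictly nest, e.g.:
 *
 *	s_addr = kmap_local_zpdesc(s_zpdesc);
 *	d_addr = kmap_local_zpdesc(d_zpdesc);
 *	...
 *	kunmap_local(d_addr);
 *	kunmap_local(s_addr);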
1728 */ 1729 if (s_off >= PAGE_SIZE) { 1730 kunmap_local(d_addr); 1731 kunmap_local(s_addr); 1732 s_zpdesc = get_next_zpdesc(s_zpdesc); 1733 s_addr = kmap_local_zpdesc(s_zpdesc); 1734 d_addr = kmap_local_zpdesc(d_zpdesc); 1735 s_size = class->size - written; 1736 s_off = 0; 1737 } 1738 1739 if (d_off >= PAGE_SIZE) { 1740 kunmap_local(d_addr); 1741 d_zpdesc = get_next_zpdesc(d_zpdesc); 1742 d_addr = kmap_local_zpdesc(d_zpdesc); 1743 d_size = class->size - written; 1744 d_off = 0; 1745 } 1746 } 1747 1748 kunmap_local(d_addr); 1749 kunmap_local(s_addr); 1750 } 1751 1752 /* 1753 * Find alloced object in zspage from index object and 1754 * return handle. 1755 */ 1756 static unsigned long find_alloced_obj(struct size_class *class, 1757 struct zpdesc *zpdesc, int *obj_idx) 1758 { 1759 unsigned int offset; 1760 int index = *obj_idx; 1761 unsigned long handle = 0; 1762 void *addr = kmap_local_zpdesc(zpdesc); 1763 1764 offset = get_first_obj_offset(zpdesc); 1765 offset += class->size * index; 1766 1767 while (offset < PAGE_SIZE) { 1768 if (obj_allocated(zpdesc, addr + offset, &handle)) 1769 break; 1770 1771 offset += class->size; 1772 index++; 1773 } 1774 1775 kunmap_local(addr); 1776 1777 *obj_idx = index; 1778 1779 return handle; 1780 } 1781 1782 static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, 1783 struct zspage *dst_zspage) 1784 { 1785 unsigned long used_obj, free_obj; 1786 unsigned long handle; 1787 int obj_idx = 0; 1788 struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); 1789 struct size_class *class = pool->size_class[src_zspage->class]; 1790 1791 while (1) { 1792 handle = find_alloced_obj(class, s_zpdesc, &obj_idx); 1793 if (!handle) { 1794 s_zpdesc = get_next_zpdesc(s_zpdesc); 1795 if (!s_zpdesc) 1796 break; 1797 obj_idx = 0; 1798 continue; 1799 } 1800 1801 used_obj = handle_to_obj(handle); 1802 free_obj = obj_malloc(pool, dst_zspage, handle); 1803 zs_object_copy(class, free_obj, used_obj); 1804 obj_idx++; 1805 obj_free(class->size, used_obj); 1806 1807 /* Stop if there is no more space */ 1808 if (zspage_full(class, dst_zspage)) 1809 break; 1810 1811 /* Stop if there are no more objects to migrate */ 1812 if (zspage_empty(src_zspage)) 1813 break; 1814 } 1815 } 1816 1817 static struct zspage *isolate_src_zspage(struct size_class *class) 1818 { 1819 struct zspage *zspage; 1820 int fg; 1821 1822 for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) { 1823 zspage = list_first_entry_or_null(&class->fullness_list[fg], 1824 struct zspage, list); 1825 if (zspage) { 1826 remove_zspage(class, zspage); 1827 return zspage; 1828 } 1829 } 1830 1831 return zspage; 1832 } 1833 1834 static struct zspage *isolate_dst_zspage(struct size_class *class) 1835 { 1836 struct zspage *zspage; 1837 int fg; 1838 1839 for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) { 1840 zspage = list_first_entry_or_null(&class->fullness_list[fg], 1841 struct zspage, list); 1842 if (zspage) { 1843 remove_zspage(class, zspage); 1844 return zspage; 1845 } 1846 } 1847 1848 return zspage; 1849 } 1850 1851 /* 1852 * putback_zspage - add @zspage into right class's fullness list 1853 * @class: destination class 1854 * @zspage: target page 1855 * 1856 * Return @zspage's fullness status 1857 */ 1858 static int putback_zspage(struct size_class *class, struct zspage *zspage) 1859 { 1860 int fullness; 1861 1862 fullness = get_fullness_group(class, zspage); 1863 insert_zspage(class, zspage, fullness); 1864 1865 return fullness; 1866 } 1867 1868 #ifdef CONFIG_COMPACTION 1869 /* 1870 * To prevent 
zspage destroy during migration, zspage freeing should 1871 * hold locks of all pages in the zspage. 1872 */ 1873 static void lock_zspage(struct zspage *zspage) 1874 { 1875 struct zpdesc *curr_zpdesc, *zpdesc; 1876 1877 /* 1878 * Pages we haven't locked yet can be migrated off the list while we're 1879 * trying to lock them, so we need to be careful and only attempt to 1880 * lock each page under zspage_read_lock(). Otherwise, the page we lock 1881 * may no longer belong to the zspage. This means that we may wait for 1882 * the wrong page to unlock, so we must take a reference to the page 1883 * prior to waiting for it to unlock outside zspage_read_lock(). 1884 */ 1885 while (1) { 1886 zspage_read_lock(zspage); 1887 zpdesc = get_first_zpdesc(zspage); 1888 if (zpdesc_trylock(zpdesc)) 1889 break; 1890 zpdesc_get(zpdesc); 1891 zspage_read_unlock(zspage); 1892 zpdesc_wait_locked(zpdesc); 1893 zpdesc_put(zpdesc); 1894 } 1895 1896 curr_zpdesc = zpdesc; 1897 while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { 1898 if (zpdesc_trylock(zpdesc)) { 1899 curr_zpdesc = zpdesc; 1900 } else { 1901 zpdesc_get(zpdesc); 1902 zspage_read_unlock(zspage); 1903 zpdesc_wait_locked(zpdesc); 1904 zpdesc_put(zpdesc); 1905 zspage_read_lock(zspage); 1906 } 1907 } 1908 zspage_read_unlock(zspage); 1909 } 1910 #endif /* CONFIG_COMPACTION */ 1911 1912 #ifdef CONFIG_COMPACTION 1913 1914 static const struct movable_operations zsmalloc_mops; 1915 1916 static void replace_sub_page(struct size_class *class, struct zspage *zspage, 1917 struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) 1918 { 1919 struct zpdesc *zpdesc; 1920 struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; 1921 unsigned int first_obj_offset; 1922 int idx = 0; 1923 1924 zpdesc = get_first_zpdesc(zspage); 1925 do { 1926 if (zpdesc == oldzpdesc) 1927 zpdescs[idx] = newzpdesc; 1928 else 1929 zpdescs[idx] = zpdesc; 1930 idx++; 1931 } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); 1932 1933 create_page_chain(class, zspage, zpdescs); 1934 first_obj_offset = get_first_obj_offset(oldzpdesc); 1935 set_first_obj_offset(newzpdesc, first_obj_offset); 1936 if (unlikely(ZsHugePage(zspage))) 1937 newzpdesc->handle = oldzpdesc->handle; 1938 __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); 1939 } 1940 1941 static bool zs_page_isolate(struct page *page, isolate_mode_t mode) 1942 { 1943 /* 1944 * Page is locked so zspage couldn't be destroyed. For detail, look at 1945 * lock_zspage in free_zspage. 1946 */ 1947 VM_BUG_ON_PAGE(PageIsolated(page), page); 1948 1949 return true; 1950 } 1951 1952 static int zs_page_migrate(struct page *newpage, struct page *page, 1953 enum migrate_mode mode) 1954 { 1955 struct zs_pool *pool; 1956 struct size_class *class; 1957 struct zspage *zspage; 1958 struct zpdesc *dummy; 1959 struct zpdesc *newzpdesc = page_zpdesc(newpage); 1960 struct zpdesc *zpdesc = page_zpdesc(page); 1961 void *s_addr, *d_addr, *addr; 1962 unsigned int offset; 1963 unsigned long handle; 1964 unsigned long old_obj, new_obj; 1965 unsigned int obj_idx; 1966 1967 VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); 1968 1969 /* The page is locked, so this pointer must remain valid */ 1970 zspage = get_zspage(zpdesc); 1971 pool = zspage->pool; 1972 1973 /* 1974 * The pool migrate_lock protects the race between zpage migration 1975 * and zs_free. 1976 */ 1977 write_lock(&pool->lock); 1978 class = zspage_class(pool, zspage); 1979 1980 /* 1981 * the class lock protects zpage alloc/free in the zspage. 
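 *
 * The migration path below therefore nests, outermost to innermost:
 *
 *	page lock (held by the migration core)
 *	  pool->lock (write)
 *	    class->lock
 *	      zspage write lock (trylock only)
 *
 * which matches the lock ordering documented at the top of this file.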
1982 */ 1983 spin_lock(&class->lock); 1984 /* the zspage write_lock protects zpage access via zs_map_object */ 1985 if (!zspage_write_trylock(zspage)) { 1986 spin_unlock(&class->lock); 1987 write_unlock(&pool->lock); 1988 return -EINVAL; 1989 } 1990 1991 /* We're committed, tell the world that this is a Zsmalloc page. */ 1992 __zpdesc_set_zsmalloc(newzpdesc); 1993 1994 offset = get_first_obj_offset(zpdesc); 1995 s_addr = kmap_local_zpdesc(zpdesc); 1996 1997 /* 1998 * Here, any user cannot access all objects in the zspage so let's move. 1999 */ 2000 d_addr = kmap_local_zpdesc(newzpdesc); 2001 copy_page(d_addr, s_addr); 2002 kunmap_local(d_addr); 2003 2004 for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; 2005 addr += class->size) { 2006 if (obj_allocated(zpdesc, addr, &handle)) { 2007 2008 old_obj = handle_to_obj(handle); 2009 obj_to_location(old_obj, &dummy, &obj_idx); 2010 new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); 2011 record_obj(handle, new_obj); 2012 } 2013 } 2014 kunmap_local(s_addr); 2015 2016 replace_sub_page(class, zspage, newzpdesc, zpdesc); 2017 /* 2018 * Since we complete the data copy and set up new zspage structure, 2019 * it's okay to release migration_lock. 2020 */ 2021 write_unlock(&pool->lock); 2022 spin_unlock(&class->lock); 2023 zspage_write_unlock(zspage); 2024 2025 zpdesc_get(newzpdesc); 2026 if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { 2027 zpdesc_dec_zone_page_state(zpdesc); 2028 zpdesc_inc_zone_page_state(newzpdesc); 2029 } 2030 2031 reset_zpdesc(zpdesc); 2032 zpdesc_put(zpdesc); 2033 2034 return MIGRATEPAGE_SUCCESS; 2035 } 2036 2037 static void zs_page_putback(struct page *page) 2038 { 2039 VM_BUG_ON_PAGE(!PageIsolated(page), page); 2040 } 2041 2042 static const struct movable_operations zsmalloc_mops = { 2043 .isolate_page = zs_page_isolate, 2044 .migrate_page = zs_page_migrate, 2045 .putback_page = zs_page_putback, 2046 }; 2047 2048 /* 2049 * Caller should hold page_lock of all pages in the zspage 2050 * In here, we cannot use zspage meta data. 
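 *
 * Deferred-free flow (sketch):
 *
 *	zs_free()
 *	  -> free_zspage(): trylock_zspage() fails because some page is locked
 *	  -> kick_deferred_free() schedules pool->free_work
 *	       -> async_free_zspage() splices the ZS_INUSE_RATIO_0 lists and
 *	          frees each zspage once lock_zspage() can lock all its pages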
static void zs_page_putback(struct page *page)
{
	VM_BUG_ON_PAGE(!PageIsolated(page), page);
}

static const struct movable_operations zsmalloc_mops = {
	.isolate_page = zs_page_isolate,
	.migrate_page = zs_page_migrate,
	.putback_page = zs_page_putback,
};

/*
 * The caller should hold the page lock of all pages in the zspage.
 * In here, we cannot use zspage metadata.
 */
static void async_free_zspage(struct work_struct *work)
{
	int i;
	struct size_class *class;
	struct zspage *zspage, *tmp;
	LIST_HEAD(free_pages);
	struct zs_pool *pool = container_of(work, struct zs_pool,
						free_work);

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
				 &free_pages);
		spin_unlock(&class->lock);
	}

	list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
		list_del(&zspage->list);
		lock_zspage(zspage);

		class = zspage_class(pool, zspage);
		spin_lock(&class->lock);
		class_stat_sub(class, ZS_INUSE_RATIO_0, 1);
		__free_zspage(pool, class, zspage);
		spin_unlock(&class->lock);
	}
}

static void kick_deferred_free(struct zs_pool *pool)
{
	schedule_work(&pool->free_work);
}

static void zs_flush_migration(struct zs_pool *pool)
{
	flush_work(&pool->free_work);
}

static void init_deferred_free(struct zs_pool *pool)
{
	INIT_WORK(&pool->free_work, async_free_zspage);
}

static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
{
	struct zpdesc *zpdesc = get_first_zpdesc(zspage);

	do {
		WARN_ON(!zpdesc_trylock(zpdesc));
		__zpdesc_set_movable(zpdesc, &zsmalloc_mops);
		zpdesc_unlock(zpdesc);
	} while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
#else
static inline void zs_flush_migration(struct zs_pool *pool) { }
#endif

/*
 * Based on the number of unused allocated objects, calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;
	unsigned long obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
	unsigned long obj_used = class_stat_read(class, ZS_OBJS_INUSE);

	if (obj_allocated <= obj_used)
		return 0;

	obj_wasted = obj_allocated - obj_used;
	obj_wasted /= class->objs_per_zspage;

	return obj_wasted * class->pages_per_zspage;
}
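/*
 * Worked example (hypothetical numbers): for a class with
 * objs_per_zspage = 50 and pages_per_zspage = 2, suppose
 * ZS_OBJS_ALLOCATED = 400 and ZS_OBJS_INUSE = 290. Then
 *
 *	obj_wasted = (400 - 290) / 50 = 2	(integer division)
 *
 * and zs_can_compact() reports that at most 2 * 2 = 4 pages could be
 * released by compacting this class.
 */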
static unsigned long __zs_compact(struct zs_pool *pool,
				  struct size_class *class)
{
	struct zspage *src_zspage = NULL;
	struct zspage *dst_zspage = NULL;
	unsigned long pages_freed = 0;

	/*
	 * Protect against the race between zpage migration and zs_free,
	 * as well as zpage allocation/free.
	 */
	write_lock(&pool->lock);
	spin_lock(&class->lock);
	while (zs_can_compact(class)) {
		int fg;

		if (!dst_zspage) {
			dst_zspage = isolate_dst_zspage(class);
			if (!dst_zspage)
				break;
		}

		src_zspage = isolate_src_zspage(class);
		if (!src_zspage)
			break;

		if (!zspage_write_trylock(src_zspage))
			break;

		migrate_zspage(pool, src_zspage, dst_zspage);
		zspage_write_unlock(src_zspage);

		fg = putback_zspage(class, src_zspage);
		if (fg == ZS_INUSE_RATIO_0) {
			free_zspage(pool, class, src_zspage);
			pages_freed += class->pages_per_zspage;
		}
		src_zspage = NULL;

		if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
		    || rwlock_is_contended(&pool->lock)) {
			putback_zspage(class, dst_zspage);
			dst_zspage = NULL;

			spin_unlock(&class->lock);
			write_unlock(&pool->lock);
			cond_resched();
			write_lock(&pool->lock);
			spin_lock(&class->lock);
		}
	}

	if (src_zspage)
		putback_zspage(class, src_zspage);

	if (dst_zspage)
		putback_zspage(class, dst_zspage);

	spin_unlock(&class->lock);
	write_unlock(&pool->lock);

	return pages_freed;
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;
	unsigned long pages_freed = 0;

	/*
	 * Pool compaction is performed under pool->lock so it is basically
	 * single-threaded. Having more than one thread in __zs_compact()
	 * will increase pool->lock contention, which will impact other
	 * zsmalloc operations that need pool->lock.
	 */
	if (atomic_xchg(&pool->compaction_in_progress, 1))
		return 0;

	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;
		pages_freed += __zs_compact(pool, class);
	}
	atomic_long_add(pages_freed, &pool->stats.pages_compacted);
	atomic_set(&pool->compaction_in_progress, 0);

	return pages_freed;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);
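/*
 * Example (illustrative, hypothetical caller): zs_compact() and
 * zs_pool_stats() are exported, so a zsmalloc user can trigger compaction
 * by hand and read the cumulative compaction counter, roughly like this:
 *
 *	struct zs_pool_stats stats;
 *	unsigned long freed;
 *
 *	freed = zs_compact(pool);
 *	zs_pool_stats(pool, &stats);
 *	pr_info("freed %lu pages, %ld compacted in total\n",
 *		freed, atomic_long_read(&stats.pages_compacted));
 */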
static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
				      struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = shrinker->private_data;

	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool);

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = shrinker->private_data;

	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	shrinker_free(pool->shrinker);
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name);
	if (!pool->shrinker)
		return -ENOMEM;

	pool->shrinker->scan_objects = zs_shrinker_scan;
	pool->shrinker->count_objects = zs_shrinker_count;
	pool->shrinker->batch = 0;
	pool->shrinker->private_data = pool;

	shrinker_register(pool->shrinker);

	return 0;
}

static int calculate_zspage_chain_size(int class_size)
{
	int i, min_waste = INT_MAX;
	int chain_size = 1;

	if (is_power_of_2(class_size))
		return chain_size;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int waste;

		waste = (i * PAGE_SIZE) % class_size;
		if (waste < min_waste) {
			min_waste = waste;
			chain_size = i;
		}
	}

	return chain_size;
}
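/*
 * Worked example (hypothetical, PAGE_SIZE = 4096 and a chain limit of at
 * least 4 pages): for class_size = 720 the per-chain-length waste is
 *
 *	1 page:   4096 % 720 = 496
 *	2 pages:  8192 % 720 = 272
 *	3 pages: 12288 % 720 =  48
 *	4 pages: 16384 % 720 = 544
 *
 * so calculate_zspage_chain_size() picks a chain of 3 pages, wasting only
 * 48 bytes per zspage for this class.
 */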
/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name to be created
 *
 * This function must be called before anything else when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(const char *name)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	init_deferred_free(pool);
	rwlock_init(&pool->lock);
	atomic_set(&pool->compaction_in_progress, 0);

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_cache(pool))
		goto err;

	/*
	 * Iterate in reverse order, because the size of the size_class that
	 * we want to use for merging should be larger than or equal to the
	 * current size.
	 */
	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		int objs_per_zspage;
		struct size_class *class;
		int fullness;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = calculate_zspage_chain_size(size);
		objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;

		/*
		 * We iterate from biggest down to smallest classes,
		 * so huge_class_size holds the size of the first huge
		 * class. Any object bigger than or equal to that will
		 * end up in the huge class.
		 */
		if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
				!huge_class_size) {
			huge_class_size = size;
			/*
			 * The object uses ZS_HANDLE_SIZE bytes to store the
			 * handle. We need to subtract it, because zs_malloc()
			 * unconditionally adds handle size before it performs
			 * the size class search - so an object may be smaller
			 * than the huge class size, yet still end up in the
			 * huge class, because it grows by ZS_HANDLE_SIZE extra
			 * bytes right before the class lookup.
			 */
			huge_class_size -= (ZS_HANDLE_SIZE - 1);
		}

		/*
		 * A size_class is used for normal zsmalloc operations such
		 * as alloc/free for that size. Although it is natural to have
		 * one size_class for each size, there is a chance that we
		 * can get better memory utilization if we use one size_class
		 * for many different sizes whose size_classes share the same
		 * characteristics. So, we make size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		class->objs_per_zspage = objs_per_zspage;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		fullness = ZS_INUSE_RATIO_0;
		while (fullness < NR_FULLNESS_GROUPS) {
			INIT_LIST_HEAD(&class->fullness_list[fullness]);
			fullness++;
		}

		prev_class = class;
	}

	/* debug only, don't abort if it fails */
	zs_pool_stat_create(pool, name);

	/*
	 * Not critical since the shrinker is only used to trigger internal
	 * defragmentation of the pool, which is optional. If registration
	 * fails we can still use the pool normally and the user can trigger
	 * compaction manually. Thus, ignore the return code.
	 */
	zs_register_shrinker(pool);

	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);
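/*
 * Example (illustrative, hypothetical caller): the typical pool lifecycle
 * as seen by a zsmalloc user. The three-argument zs_malloc() form is
 * assumed here - check the zsmalloc.h that matches this file for the exact
 * prototypes - and error handling is omitted for brevity:
 *
 *	struct zs_pool *pool = zs_create_pool("example");
 *	unsigned long handle;
 *
 *	handle = zs_malloc(pool, 128, GFP_KERNEL);
 *	...
 *	zs_free(pool, handle);
 *	zs_destroy_pool(pool);
 */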
void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_flush_migration(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) {
			if (list_empty(&class->fullness_list[fg]))
				continue;

			pr_err("Class-%d fullness group %d is not empty\n",
			       class->size, fg);
		}
		kfree(class);
	}

	destroy_cache(pool);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
				zs_cpu_prepare, zs_cpu_dead);
	if (ret)
		goto out;

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	zs_stat_init();

	return 0;

out:
	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("zsmalloc memory allocator");