/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private: refers to the component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *	page->inuse: the number of objects that are used in this zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER	2
#define ZS_MAX_PAGES_PER_ZSPAGE	(_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
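 *
 * As a rough illustration only (assuming a 64-bit build with 4K pages and
 * MAX_PHYSMEM_BITS left at its BITS_PER_LONG default), the encoded value
 * breaks down as:
 *
 *	_PFN_BITS      = 64 - 12 = 52
 *	OBJ_INDEX_BITS = 64 - 52 - 1 = 11
 *
 *	bit  0      : tag bit (OBJ_TAG_BITS)
 *	bits 1..11  : obj_idx (OBJ_INDEX_MASK == 0x7ff)
 *	bits 12..63 : PFN
 *
 * The exact split shifts with the architecture's MAX_PHYSMEM_BITS and
 * PAGE_SHIFT; see location_to_obj()/obj_to_location() below.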
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS	36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_PHYSMEM_BITS	BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle stores the object's position,
 * encoded as <page, obj_idx>; the encoded value leaves its least
 * significant bit unused (see obj_to_location()).
 * We use that bit to synchronize object access between the user
 * and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The header of an allocated object carries OBJ_ALLOCATED_TAG so we can
 * tell whether the object is allocated or not.
 * It's okay to keep this status bit in the least significant bit because
 * the header holds a handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
};

#ifdef CONFIG_ZSMALLOC_STAT
#define NR_ZS_STAT_TYPE	(CLASS_ALMOST_EMPTY + 1)
#else
#define NR_ZS_STAT_TYPE	(OBJ_USED + 1)
#endif

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
 *	n <= 3 * N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac (= 4)
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > 3 * N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: get_fullness_group(), fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	spinlock_t lock;
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
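	 *
	 * With 4K pages and ZS_MIN_ALLOC_SIZE at its usual 32 bytes, for
	 * example, the class sizes run from 32 up to ZS_MAX_ALLOC_SIZE
	 * (4096) in steps of ZS_SIZE_CLASS_DELTA (16), giving the 255
	 * classes mentioned above (before merging by can_merge()).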
	 */
	int size;
	unsigned int index;

	struct zs_size_stat stats;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;
	/*
	 * To signify that register_shrinker() was successful
	 * and unregister_shrinker() will not Oops.
	 */
	bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		gfp & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	/*
	 * lsb of @obj represents the handle lock while the other bits
	 * represent the object value the handle points to, so the
	 * update must not cause store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size, gfp);
	return *handle ? 0 : -1;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *first_page,
				unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	m = (unsigned long)first_page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *first_page,
				unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	first_page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
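 *
 * For example, with 4K pages (ZS_SIZE_CLASS_DELTA == 16) and the usual
 * ZS_MIN_ALLOC_SIZE of 32, a request for 256 bytes (handle included)
 * gives DIV_ROUND_UP(256 - 32, 16) == 14, i.e. the class whose chunk
 * size is 32 + 14 * 16 == 256.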
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	if (type < NR_ZS_STAT_TYPE)
		return class->stats.objs[type];
	return 0;
}

#ifdef CONFIG_ZSMALLOC_STAT

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
				" %10lu %10lu %16d %8lu\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage, freeable);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages, "", total_freeable);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open           = zs_stats_size_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release,
};

static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
			pool->stat_dentry, pool, &zs_stat_size_ops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		return;
	}
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static int __init zs_stat_init(void)
{
	return 0;
}

static void __exit zs_stat_exit(void)
{
}

static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif

/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool. This function returns fullness status of the given page.
 */
static enum fullness_group get_fullness_group(struct page *first_page)
{
	int inuse, max_objects;
	enum fullness_group fg;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	inuse = first_page->inuse;
	max_objects = first_page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct size_class *class,
				enum fullness_group fullness,
				struct page *first_page)
{
	struct page **head;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);

	head = &class->fullness_list[fullness];
	if (!*head) {
		*head = first_page;
		return;
	}

	/*
	 * We want to see more ZS_FULL pages and fewer almost
	 * empty/full ones. Put pages with higher ->inuse first.
	 */
	list_add_tail(&first_page->lru, &(*head)->lru);
	if (first_page->inuse >= (*head)->inuse)
		*head = first_page;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct size_class *class,
				enum fullness_group fullness,
				struct page *first_page)
{
	struct page **head;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	VM_BUG_ON_PAGE(!*head, first_page);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == first_page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&first_page->lru);
	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *first_page)
{
	unsigned int class_idx;
	enum fullness_group currfg, newfg;

	get_zspage_mapping(first_page, &class_idx, &currfg);
	newfg = get_fullness_group(first_page);
	if (newfg == currfg)
		goto out;

	remove_zspage(class, currfg, first_page);
	insert_zspage(class, newfg, first_page);
	set_zspage_mapping(first_page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return (struct page *)page_private(page);
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		VM_BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object handle. We adjust the
 * decoded obj_idx back to its original value since it was adjusted in
 * location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON_PAGE(!is_first_page(page), page);
		return page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle))
		;
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
	VM_BUG_ON_PAGE(first_page->inuse, first_page);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct size_class *class, struct page *first_page)
{
	unsigned long off = 0;
	struct page *page = first_page;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
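		 *
		 * For example, with 4K pages and class->size == 80, the
		 * object that starts at offset 4080 in one page spills
		 * 64 bytes into the next page, so that next page gets
		 * ->index == 64.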
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->private
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			set_page_private(page, (unsigned long)first_page);
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(class, first_page);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	buf = buf + ZS_HANDLE_SIZE;
	size -= ZS_HANDLE_SIZE;
	off += ZS_HANDLE_SIZE;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;
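
	/*
	 * Tear down the per-cpu mapping areas that zs_register_cpu_notifier()
	 * set up, then drop the notifier itself.
	 */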
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *first_page)
{
	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	return first_page->inuse == first_page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode to use
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
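	 *
	 * Objects that fit in one page are kmap_atomic()'d directly below;
	 * an object that crosses a page boundary goes through
	 * __zs_map_object(), which either maps both pages into a per-cpu
	 * VM area (CONFIG_PGTABLE_MAPPING) or copies the pieces into the
	 * per-cpu vm_buf.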
	 */
	WARN_ON_ONCE(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct size_class *class,
				struct page *first_page, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 * @gfp: gfp flags to use if new memory needs to be allocated
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
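 *
 * A minimal usage sketch (error handling and the caller's own pool, gfp
 * and buffer choices elided):
 *
 *	unsigned long handle = zs_malloc(pool, len, GFP_KERNEL);
 *
 *	if (handle) {
 *		void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *
 *		memcpy(dst, src, len);
 *		zs_unmap_object(pool, handle);
 *	}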
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool, gfp);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, gfp);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(class, first_page, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct size_class *class, unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	unsigned int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(struct size_class *class, unsigned long dst,
				unsigned long src)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find an allocated object in the zspage, starting from the given object
 * index, and return its handle.
 */
static unsigned long find_alloced_obj(struct size_class *class,
					struct page *page, int index)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage */
	struct page *s_page;
	/*
	 * Destination page for migration which should be a first page
	 * of zspage.
	 */
	struct page *d_page;
	/*
	 * Starting object index within @s_page from which to scan for
	 * live objects in the subpage.
	 */
	int index;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(class, s_page, index);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(class, d_page, handle);
		zs_object_copy(class, free_obj, used_obj);
		index++;
		/*
		 * record_obj updates handle's value to free_obj and it will
		 * invalidate lock bit (i.e. HANDLE_PIN_BIT) of handle, which
		 * breaks synchronization using pin_tag (e.g. in zs_free), so
		 * let's keep the lock bit.
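		 * (The bit is cleared again by the unpin_tag() call just
		 * below.)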
		 */
		free_obj |= BIT(HANDLE_PIN_BIT);
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(class, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;

	return ret;
}

static struct page *isolate_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(class, i, page);
			break;
		}
	}

	return page;
}

/*
 * putback_zspage - add @first_page into right class's fullness list
 * @pool: target pool
 * @class: destination class
 * @first_page: target page
 *
 * Return @first_page's fullness_group
 */
static enum fullness_group putback_zspage(struct zs_pool *pool,
				struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	fullness = get_fullness_group(first_page);
	insert_zspage(class, fullness, first_page);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}

	return fullness;
}

static struct page *isolate_source_page(struct size_class *class)
{
	int i;
	struct page *page = NULL;

	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
		page = class->fullness_list[i];
		if (!page)
			continue;

		remove_zspage(class, i, page);
		break;
	}

	return page;
}

/*
 * Based on the number of unused allocated objects, calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;
	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
	unsigned long obj_used = zs_stat_get(class, OBJ_USED);

	if (obj_allocated <= obj_used)
		return 0;

	obj_wasted = obj_allocated - obj_used;
	obj_wasted /= get_maxobj_per_zspage(class->size,
			class->pages_per_zspage);

	return obj_wasted * class->pages_per_zspage;
}

static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		if (!zs_can_compact(class))
			break;

		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = isolate_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
		}

		/* Stop if we couldn't find slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
			pool->stats.pages_compacted += class->pages_per_zspage;
		spin_unlock(&class->lock);
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		__zs_compact(pool, class);
	}

	return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);

static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	pages_freed = pool->stats.pages_compacted;
	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool) - pages_freed;

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	if (pool->shrinker_enabled) {
		unregister_shrinker(&pool->shrinker);
		pool->shrinker_enabled = false;
	}
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name, also used for the debugfs statistics directory
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
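 *
 * A minimal usage sketch (the pool name is illustrative):
 *
 *	struct zs_pool *pool = zs_create_pool("my_pool");
 *
 *	if (pool) {
 *		... zs_malloc()/zs_map_object()/zs_free() on the pool ...
 *		zs_destroy_pool(pool);
 *	}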
 */
struct zs_pool *zs_create_pool(const char *name)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse order, because the size of a size_class that
	 * we want to use for merging should be larger than or equal to
	 * the current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * A size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make a size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	/* debug only, don't abort if it fails */
	zs_pool_stat_create(pool, name);

	/*
	 * Not critical, we still can use the pool
	 * and user can trigger compaction manually.
	 */
	if (zs_register_shrinker(pool) == 0)
		pool->shrinker_enabled = true;
	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");