/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private: refers to the component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *	page->inuse: the number of objects that are used in this zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
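 *
 * Purely as an illustration (the real values depend on the architecture's
 * MAX_PHYSMEM_BITS and PAGE_SHIFT): on a 64-bit machine with 4K pages and
 * an assumed MAX_PHYSMEM_BITS of 46, _PFN_BITS = 46 - 12 = 34 and
 * OBJ_INDEX_BITS = 64 - 34 - 1 = 29, so the encoded value packs the PFN
 * in the upper 34 bits, the object index in the next 29 bits and the tag
 * in bit 0 (see location_to_obj()/obj_to_location() below).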
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT - OBJ_TAG_BITS
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle stores the object position by
 * encoding <page, obj_idx>, and the encoded value leaves room in its
 * least significant bit (see obj_to_location()).
 * We use that bit to synchronize between object access by the user
 * and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object carries OBJ_ALLOCATED_TAG
 * to identify whether the object is allocated or not.
 * It's okay to add the status bit in the least significant bit because
 * the header keeps a handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
};

#ifdef CONFIG_ZSMALLOC_STAT
#define NR_ZS_STAT_TYPE	(CLASS_ALMOST_EMPTY + 1)
#else
#define NR_ZS_STAT_TYPE	(OBJ_USED + 1)
#endif

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	spinlock_t lock;
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
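	 *
	 * As an illustrative sketch (assuming 4K pages), class sizes run
	 * from ZS_MIN_ALLOC_SIZE (32) up to ZS_MAX_ALLOC_SIZE (PAGE_SIZE)
	 * in steps of ZS_SIZE_CLASS_DELTA (16): 32, 48, 64, ..., 4096.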
	 */
	int size;
	unsigned int index;

	struct zs_size_stat stats;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for a non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;
	/*
	 * To signify that register_shrinker() was successful
	 * and unregister_shrinker() will not Oops.
	 */
	bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping objects that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		gfp & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	/*
	 * lsb of @obj represents handle lock while other bits
	 * represent object value the handle is pointing to, so
	 * updating shouldn't do store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size, gfp);
	return *handle ? 0 : -1;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *first_page,
				unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	m = (unsigned long)first_page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *first_page,
				unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	first_page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
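 *
 * As a rough illustration (assuming 4K pages, so ZS_SIZE_CLASS_DELTA is 16
 * and ZS_MIN_ALLOC_SIZE is 32): for a 100-byte request,
 * idx = DIV_ROUND_UP(100 - 32, 16) = 5, i.e. the class whose chunk size
 * is 32 + 5 * 16 = 112 bytes.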
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	if (type < NR_ZS_STAT_TYPE)
		return class->stats.objs[type];
	return 0;
}

#ifdef CONFIG_ZSMALLOC_STAT

static void __init zs_stat_init(void)
{
	if (!debugfs_initialized()) {
		pr_warn("debugfs not available, stat dir not created\n");
		return;
	}

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
				" %10lu %10lu %16d %8lu\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage, freeable);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages, "", total_freeable);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open		= zs_stats_size_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
	struct dentry *entry;

	if (!zs_stat_root) {
		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
		return;
	}

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
			pool->stat_dentry, pool, &zs_stat_size_ops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		debugfs_remove_recursive(pool->stat_dentry);
		pool->stat_dentry = NULL;
	}
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static void __init zs_stat_init(void)
{
}

static void __exit zs_stat_exit(void)
{
}

static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif

/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (not yet implemented). This function returns fullness
 * status of the given page.
 */
static enum fullness_group get_fullness_group(struct page *first_page)
{
	int inuse, max_objects;
	enum fullness_group fg;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	inuse = first_page->inuse;
	max_objects = first_page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct size_class *class,
				enum fullness_group fullness,
				struct page *first_page)
{
	struct page **head;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);

	head = &class->fullness_list[fullness];
	if (!*head) {
		*head = first_page;
		return;
	}

	/*
	 * We want to see more ZS_FULL pages and fewer almost
	 * empty/full ones. Put pages with higher ->inuse first.
	 */
	list_add_tail(&first_page->lru, &(*head)->lru);
	if (first_page->inuse >= (*head)->inuse)
		*head = first_page;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct size_class *class,
				enum fullness_group fullness,
				struct page *first_page)
{
	struct page **head;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	VM_BUG_ON_PAGE(!*head, first_page);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == first_page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&first_page->lru);
	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *first_page)
{
	unsigned int class_idx;
	enum fullness_group currfg, newfg;

	get_zspage_mapping(first_page, &class_idx, &currfg);
	newfg = get_fullness_group(first_page);
	if (newfg == currfg)
		goto out;

	remove_zspage(class, currfg, first_page);
	insert_zspage(class, newfg, first_page);
	set_zspage_mapping(first_page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at the end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for a size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return (struct page *)page_private(page);
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		VM_BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode the <page, obj_idx> pair from the given object value (the value
 * stored in a handle); this reverses the encoding done in location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON_PAGE(!is_first_page(page), page);
		return page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle))
		;
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
	VM_BUG_ON_PAGE(first_page->inuse, first_page);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct size_class *class, struct page *first_page)
{
	unsigned long off = 0;
	struct page *page = first_page;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
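		 *
		 * As a small worked example (assuming 4K pages and an
		 * illustrative class->size of 1536 bytes, i.e. 3/8 of
		 * PAGE_SIZE): page 0 holds objects at offsets 0, 1536
		 * and 3072; the object at 3072 spills 512 bytes into
		 * page 1, so the loop below exits with off == 4608 and
		 * off %= PAGE_SIZE leaves 512, which becomes page 1's
		 * ->index, the offset of the first object starting in
		 * page 1.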
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->private
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			set_page_private(page, (unsigned long)first_page);
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(class, first_page);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	buf = buf + ZS_HANDLE_SIZE;
	size -= ZS_HANDLE_SIZE;
	off += ZS_HANDLE_SIZE;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *first_page)
{
	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);

	return first_page->inuse == first_page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode to use
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	WARN_ON_ONCE(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct size_class *class,
				struct page *first_page, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 * @gfp: gfp flags used when memory needs to be allocated
 *
 * On success, a handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
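 *
 * A minimal usage sketch (illustrative only; error handling omitted and
 * the src/len names are made up):
 *
 *	unsigned long handle = zs_malloc(pool, len, GFP_KERNEL);
 *	void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);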
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool, gfp);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, gfp);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(class, first_page, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct size_class *class, unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	unsigned int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(struct size_class *class, unsigned long dst,
				unsigned long src)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find an allocated object in the zspage, starting from the given index,
 * and return its handle.
 */
static unsigned long find_alloced_obj(struct size_class *class,
					struct page *page, int index)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/*
	 * Destination page for migration which should be a first page
	 * of zspage.
	 */
	struct page *d_page;
	/*
	 * Starting object index within @s_page from which to search for
	 * live objects in the subpage.
	 */
	int index;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(class, s_page, index);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(class, d_page, handle);
		zs_object_copy(class, free_obj, used_obj);
		index++;
		/*
		 * record_obj() updates the handle's value to free_obj, which
		 * would also overwrite the lock bit (i.e. HANDLE_PIN_BIT) and
		 * break the synchronization based on pin_tag() (e.g. in
		 * zs_free()), so keep the lock bit set.
		 */
		free_obj |= BIT(HANDLE_PIN_BIT);
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(class, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;

	return ret;
}

static struct page *isolate_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(class, i, page);
			break;
		}
	}

	return page;
}

/*
 * putback_zspage - add @first_page into right class's fullness list
 * @pool: target pool
 * @class: destination class
 * @first_page: target page
 *
 * Return @first_page's fullness_group
 */
static enum fullness_group putback_zspage(struct zs_pool *pool,
				struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	fullness = get_fullness_group(first_page);
	insert_zspage(class, fullness, first_page);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}

	return fullness;
}

static struct page *isolate_source_page(struct size_class *class)
{
	int i;
	struct page *page = NULL;

	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
		page = class->fullness_list[i];
		if (!page)
			continue;

		remove_zspage(class, i, page);
		break;
	}

	return page;
}

/*
 * Based on the number of unused allocated objects calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;
	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
	unsigned long obj_used = zs_stat_get(class, OBJ_USED);

	if (obj_allocated <= obj_used)
		return 0;

	obj_wasted = obj_allocated - obj_used;
	obj_wasted /= get_maxobj_per_zspage(class->size,
			class->pages_per_zspage);

	return obj_wasted * class->pages_per_zspage;
}

static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		if (!zs_can_compact(class))
			break;

		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = isolate_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
		}

		/* Stop if we couldn't find a slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
			pool->stats.pages_compacted += class->pages_per_zspage;
		spin_unlock(&class->lock);
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		__zs_compact(pool, class);
	}

	return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);

static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	pages_freed = pool->stats.pages_compacted;
	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool) - pages_freed;

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	if (pool->shrinker_enabled) {
		unregister_shrinker(&pool->shrinker);
		pool->shrinker_enabled = false;
	}
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: name of the pool to be created
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
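 *
 * A minimal lifecycle sketch (illustrative only; "my_pool" is a made-up
 * name and error checks are omitted):
 *
 *	struct zs_pool *pool = zs_create_pool("my_pool");
 *	... allocate and use objects via zs_malloc()/zs_map_object() ...
 *	zs_destroy_pool(pool);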
 */
struct zs_pool *zs_create_pool(const char *name)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse order, because the size of the size_class that
	 * we want to merge into should be larger than or equal to the
	 * current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * A size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make a size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	/* debug only, don't abort if it fails */
	zs_pool_stat_create(pool, name);

	/*
	 * Not critical, we still can use the pool
	 * and user can trigger compaction manually.
	 */
	if (zs_register_shrinker(pool) == 0)
		pool->shrinker_enabled = true;
	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	zs_stat_init();

	return 0;

notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");