/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011 Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->first_page: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private (union with page->first_page): refers to the
 *		component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->pages_per_zspage * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#ifdef CONFIG_ZSMALLOC_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be a power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
 */
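
/*
 * Worked example (an illustrative sketch, assuming a 64-bit system with
 * 4K pages and MAX_PHYSMEM_BITS == BITS_PER_LONG, so the constants defined
 * below become _PFN_BITS == 52 and OBJ_INDEX_BITS == 11): for a
 * hypothetical page frame number 0x12345 and obj_idx 3, location_to_obj()
 * computes
 *
 *	obj  = (0x12345 << OBJ_INDEX_BITS) | (3 & OBJ_INDEX_MASK);
 *	obj <<= OBJ_TAG_BITS;	(leaves bit 0 free for OBJ_ALLOCATED_TAG)
 *
 * and obj_to_location() simply reverses the shifts to recover the pair.
 */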

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle keeps the object position by
 * encoding <page, obj_idx>, and the encoded value has spare room in its
 * least significant bit (i.e., look at obj_to_location).
 * We use that bit to synchronize between object access by the
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object carries OBJ_ALLOCATED_TAG so we can
 * identify whether the object is allocated or not.
 * It's okay to add the status bit in the least significant bit because
 * the header keeps a handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
	NR_ZS_STAT_TYPE,
};

#ifdef CONFIG_ZSMALLOC_STAT

static struct dentry *zs_stat_root;

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
 *	n <= 3 * N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > 3 * N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	unsigned int index;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;

#ifdef CONFIG_ZSMALLOC_STAT
	struct zs_size_stat stats;
#endif

	spinlock_t lock;

	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be a power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	gfp_t flags;	/* allocation flags used when growing pool */
	atomic_long_t pages_allocated;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
	bool huge;
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		pool->flags & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}
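
/*
 * An illustrative sketch (not part of the original flow, error handling
 * omitted) of how the handle indirection above is used: the "handle"
 * returned to users is the address of a slab object which in turn stores
 * the encoded <PFN, obj_idx> value, so compaction can later relocate the
 * backing object while the user-visible handle stays stable:
 *
 *	unsigned long handle = alloc_handle(pool);	(slab slot)
 *	unsigned long obj = obj_malloc(first_page, class, handle);
 *	record_obj(handle, obj);	(*(unsigned long *)handle = obj)
 *	...
 *	obj = handle_to_obj(handle);	(read back on map/free)
 */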

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
	return zs_create_pool(name, gfp);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size);
	return *handle ? 0 : -1;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type = "zsmalloc",
	.owner = THIS_MODULE,
	.create = zs_zpool_create,
	.destroy = zs_zpool_destroy,
	.malloc = zs_zpool_malloc,
	.free = zs_zpool_free,
	.shrink = zs_zpool_shrink,
	.map = zs_zpool_map,
	.unmap = zs_zpool_unmap,
	.total_size = zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}
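
/*
 * Worked example (a sketch assuming a 64-bit system with 4K pages, so
 * ZS_MIN_ALLOC_SIZE == 32 and ZS_SIZE_CLASS_DELTA == 16): a 300-byte
 * allocation grows to 300 + ZS_HANDLE_SIZE == 308 bytes in zs_malloc(),
 * and
 *
 *	idx = DIV_ROUND_UP(308 - 32, 16) = DIV_ROUND_UP(276, 16) = 18
 *
 * so the request is served by the class whose chunk size is
 * 32 + 18 * 16 = 320 bytes.
 */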

#ifdef CONFIG_ZSMALLOC_STAT

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	return class->stats.objs[type];
}

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open = zs_stats_size_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int zs_pool_stat_create(char *name, struct zs_pool *pool)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return -ENODEV;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
failed\n", name); 560 return -ENOMEM; 561 } 562 pool->stat_dentry = entry; 563 564 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, 565 pool->stat_dentry, pool, &zs_stat_size_ops); 566 if (!entry) { 567 pr_warn("%s: debugfs file entry <%s> creation failed\n", 568 name, "classes"); 569 return -ENOMEM; 570 } 571 572 return 0; 573 } 574 575 static void zs_pool_stat_destroy(struct zs_pool *pool) 576 { 577 debugfs_remove_recursive(pool->stat_dentry); 578 } 579 580 #else /* CONFIG_ZSMALLOC_STAT */ 581 582 static inline void zs_stat_inc(struct size_class *class, 583 enum zs_stat_type type, unsigned long cnt) 584 { 585 } 586 587 static inline void zs_stat_dec(struct size_class *class, 588 enum zs_stat_type type, unsigned long cnt) 589 { 590 } 591 592 static inline unsigned long zs_stat_get(struct size_class *class, 593 enum zs_stat_type type) 594 { 595 return 0; 596 } 597 598 static int __init zs_stat_init(void) 599 { 600 return 0; 601 } 602 603 static void __exit zs_stat_exit(void) 604 { 605 } 606 607 static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) 608 { 609 return 0; 610 } 611 612 static inline void zs_pool_stat_destroy(struct zs_pool *pool) 613 { 614 } 615 616 #endif 617 618 619 /* 620 * For each size class, zspages are divided into different groups 621 * depending on how "full" they are. This was done so that we could 622 * easily find empty or nearly empty zspages when we try to shrink 623 * the pool (not yet implemented). This function returns fullness 624 * status of the given page. 625 */ 626 static enum fullness_group get_fullness_group(struct page *page) 627 { 628 int inuse, max_objects; 629 enum fullness_group fg; 630 BUG_ON(!is_first_page(page)); 631 632 inuse = page->inuse; 633 max_objects = page->objects; 634 635 if (inuse == 0) 636 fg = ZS_EMPTY; 637 else if (inuse == max_objects) 638 fg = ZS_FULL; 639 else if (inuse <= 3 * max_objects / fullness_threshold_frac) 640 fg = ZS_ALMOST_EMPTY; 641 else 642 fg = ZS_ALMOST_FULL; 643 644 return fg; 645 } 646 647 /* 648 * Each size class maintains various freelists and zspages are assigned 649 * to one of these freelists based on the number of live objects they 650 * have. This functions inserts the given zspage into the freelist 651 * identified by <class, fullness_group>. 652 */ 653 static void insert_zspage(struct page *page, struct size_class *class, 654 enum fullness_group fullness) 655 { 656 struct page **head; 657 658 BUG_ON(!is_first_page(page)); 659 660 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 661 return; 662 663 head = &class->fullness_list[fullness]; 664 if (*head) 665 list_add_tail(&page->lru, &(*head)->lru); 666 667 *head = page; 668 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? 669 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 670 } 671 672 /* 673 * This function removes the given zspage from the freelist identified 674 * by <class, fullness_group>. 675 */ 676 static void remove_zspage(struct page *page, struct size_class *class, 677 enum fullness_group fullness) 678 { 679 struct page **head; 680 681 BUG_ON(!is_first_page(page)); 682 683 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 684 return; 685 686 head = &class->fullness_list[fullness]; 687 BUG_ON(!*head); 688 if (list_empty(&(*head)->lru)) 689 *head = NULL; 690 else if (*head == page) 691 *head = (struct page *)list_entry((*head)->lru.next, 692 struct page, lru); 693 694 list_del_init(&page->lru); 695 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *page)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
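
/*
 * Worked example (illustrative, assuming 4K pages): for a hypothetical
 * 720-byte class the loop above evaluates
 *
 *	i = 1: waste = 4096 % 720 = 496   -> usedpc = 87
 *	i = 2: waste = 8192 % 720 = 272   -> usedpc = 96
 *	i = 3: waste = 12288 % 720 = 48   -> usedpc = 99
 *	i = 4: waste = 16384 % 720 = 544  -> usedpc = 96
 *
 * so get_pages_per_zspage(720) links 3 pages per zspage.
 */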

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return page->first_page;
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object handle. We adjust the
 * decoded obj_idx back to its original value since it was adjusted in
 * location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON(!is_first_page(page));
		return *(unsigned long *)page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle));
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}
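
/*
 * Freelist layout sketch (illustrative, assuming 4K pages and a
 * hypothetical 1024-byte class, which needs only one page per zspage):
 * init_zspage() above leaves the four chunks at offsets 0, 1024, 2048
 * and 3072 linked as
 *
 *	chunk@0    -> location_to_obj(page, 1)
 *	chunk@1024 -> location_to_obj(page, 2)
 *	chunk@2048 -> location_to_obj(page, 3)
 *	chunk@3072 -> location_to_obj(next_page, 0), i.e. NULL here
 *
 * and alloc_zspage() then sets first_page->freelist to
 * location_to_obj(first_page, 0).
 */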

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			page->first_page = first_page;
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	if (!area->huge) {
		buf = buf + ZS_HANDLE_SIZE;
		size -= ZS_HANDLE_SIZE;
		off += ZS_HANDLE_SIZE;
	}

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}
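
/*
 * Quick sanity check (assuming 4K pages and ZS_MIN_ALLOC_SIZE == 32):
 * (4096 - 32) / 16 + 1 == 255 with no remainder, which is where the
 * "255 size classes" figure quoted above ZS_SIZE_CLASS_DELTA comes from.
 */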

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode to use
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	BUG_ON(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct page *first_page,
		struct size_class *class, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(first_page, class, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct zs_pool *pool, struct size_class *class,
			unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;
	int class_idx;
	enum fullness_group fullness;

	BUG_ON(!obj);

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(pool, class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(unsigned long src, unsigned long dst,
				struct size_class *class)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			BUG_ON(!s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			BUG_ON(!d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find an allocated object in the zspage, starting from the given index,
 * and return its handle.
 */
static unsigned long find_alloced_obj(struct page *page, int index,
					struct size_class *class)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page, used to find a live
	 * object in the subpage.
	 */
	int index;
	/* how many of objects are migrated */
	int nr_migrated;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int nr_migrated = 0;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(s_page, index, class);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(d_page, class, handle);
		zs_object_copy(used_obj, free_obj, class);
		index++;
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(pool, class, used_obj);
		nr_migrated++;
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;
	cc->nr_migrated = nr_migrated;

	return ret;
}

static struct page *alloc_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(page, class, i);
			break;
		}
	}

	return page;
}

static void putback_zspage(struct zs_pool *pool, struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	BUG_ON(!is_first_page(first_page));

	fullness = get_fullness_group(first_page);
	insert_zspage(first_page, class, fullness);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}
}

static struct page *isolate_source_page(struct size_class *class)
{
	struct page *page;

	page = class->fullness_list[ZS_ALMOST_EMPTY];
	if (page)
		remove_zspage(page, class, ZS_ALMOST_EMPTY);

	return page;
}

static unsigned long __zs_compact(struct zs_pool *pool,
				struct size_class *class)
{
	int nr_to_migrate;
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;
	unsigned long nr_total_migrated = 0;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		BUG_ON(!is_first_page(src_page));

		/* The goal is to migrate all live objects in source page */
		nr_to_migrate = src_page->inuse;
		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = alloc_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, try to
			 * allocate another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
			nr_total_migrated += cc.nr_migrated;
			nr_to_migrate -= cc.nr_migrated;
		}

		/* Stop if we couldn't find slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		putback_zspage(pool, class, src_page);
		spin_unlock(&class->lock);
		nr_total_migrated += cc.nr_migrated;
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);

	return nr_total_migrated;
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	unsigned long nr_migrated = 0;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		nr_migrated += __zs_compact(pool, class);
	}

	return nr_migrated;
}
EXPORT_SYMBOL_GPL(zs_compact);

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: name of the pool (used for debugfs statistics)
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse order because the size of a size_class that we
	 * want to use for merging should be greater than or equal to the
	 * current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_class have same
		 * characteristics. So, we make size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	pool->flags = flags;

	if (zs_pool_stat_create(name, pool))
		goto err;

	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
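
/*
 * Typical usage by a client such as zram or zswap (an illustrative sketch
 * only, not part of this file; error handling omitted):
 *
 *	struct zs_pool *pool = zs_create_pool("example", GFP_NOIO | __GFP_HIGHMEM);
 *	unsigned long handle = zs_malloc(pool, len);
 *	void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);
 *	zs_destroy_pool(pool);
 */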