/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->first_page: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private (union with page->first_page): refers to the
 *		component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->pages_per_zspage * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
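 *
 * For example (illustrative numbers only), on a 64-bit system with
 * 4 KiB pages and the default MAX_PHYSMEM_BITS == BITS_PER_LONG,
 * _PFN_BITS below works out to 52 and OBJ_INDEX_BITS to 11, so the
 * encoded value packs <PFN, obj_idx> as ((pfn << 11) | obj_idx) << 1,
 * leaving the least significant bit free for tagging.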
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT - OBJ_TAG_BITS
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * Memory for allocating for handle keeps object position by
 * encoding <page, obj_idx> and the encoded value has room in the
 * least significant bit (ie, look at obj_to_location).
 * We use the bit to synchronize between object access by
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * Head in allocated object should have OBJ_ALLOCATED_TAG
 * to identify whether the object is allocated or not.
 * It's okay to add the status bit in the least bit because
 * header keeps handle which is 4-byte-aligned address so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
	NR_ZS_STAT_TYPE,
};

#ifdef CONFIG_ZSMALLOC_STAT

static struct dentry *zs_stat_root;

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
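	 * (Illustrative: on a 64-bit system with 4 KiB pages, class sizes
	 * run from ZS_MIN_ALLOC_SIZE (32) up to ZS_MAX_ALLOC_SIZE in
	 * ZS_SIZE_CLASS_DELTA (16 byte) steps.)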
	 */
	int size;
	unsigned int index;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;

#ifdef CONFIG_ZSMALLOC_STAT
	struct zs_size_stat stats;
#endif

	spinlock_t lock;

	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	gfp_t flags;	/* allocation flags used when growing pool */
	atomic_long_t pages_allocated;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
	bool huge;
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	if (pool->handle_cachep)
		kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		pool->flags & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	return zs_create_pool(name, gfp);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size);
	return *handle ? 0 : -1;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
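 *
 * For example (illustrative, assuming a 64-bit system with 4 KiB pages,
 * i.e. ZS_MIN_ALLOC_SIZE == 32 and ZS_SIZE_CLASS_DELTA == 16), a request
 * of 100 bytes maps to index DIV_ROUND_UP(100 - 32, 16) == 5.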
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}

#ifdef CONFIG_ZSMALLOC_STAT

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	return class->stats.objs[type];
}

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open           = zs_stats_size_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release,
};

static int zs_pool_stat_create(char *name, struct zs_pool *pool)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return -ENODEV;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
failed\n", name); 558 return -ENOMEM; 559 } 560 pool->stat_dentry = entry; 561 562 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, 563 pool->stat_dentry, pool, &zs_stat_size_ops); 564 if (!entry) { 565 pr_warn("%s: debugfs file entry <%s> creation failed\n", 566 name, "classes"); 567 return -ENOMEM; 568 } 569 570 return 0; 571 } 572 573 static void zs_pool_stat_destroy(struct zs_pool *pool) 574 { 575 debugfs_remove_recursive(pool->stat_dentry); 576 } 577 578 #else /* CONFIG_ZSMALLOC_STAT */ 579 580 static inline void zs_stat_inc(struct size_class *class, 581 enum zs_stat_type type, unsigned long cnt) 582 { 583 } 584 585 static inline void zs_stat_dec(struct size_class *class, 586 enum zs_stat_type type, unsigned long cnt) 587 { 588 } 589 590 static inline unsigned long zs_stat_get(struct size_class *class, 591 enum zs_stat_type type) 592 { 593 return 0; 594 } 595 596 static int __init zs_stat_init(void) 597 { 598 return 0; 599 } 600 601 static void __exit zs_stat_exit(void) 602 { 603 } 604 605 static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) 606 { 607 return 0; 608 } 609 610 static inline void zs_pool_stat_destroy(struct zs_pool *pool) 611 { 612 } 613 614 #endif 615 616 617 /* 618 * For each size class, zspages are divided into different groups 619 * depending on how "full" they are. This was done so that we could 620 * easily find empty or nearly empty zspages when we try to shrink 621 * the pool (not yet implemented). This function returns fullness 622 * status of the given page. 623 */ 624 static enum fullness_group get_fullness_group(struct page *page) 625 { 626 int inuse, max_objects; 627 enum fullness_group fg; 628 BUG_ON(!is_first_page(page)); 629 630 inuse = page->inuse; 631 max_objects = page->objects; 632 633 if (inuse == 0) 634 fg = ZS_EMPTY; 635 else if (inuse == max_objects) 636 fg = ZS_FULL; 637 else if (inuse <= 3 * max_objects / fullness_threshold_frac) 638 fg = ZS_ALMOST_EMPTY; 639 else 640 fg = ZS_ALMOST_FULL; 641 642 return fg; 643 } 644 645 /* 646 * Each size class maintains various freelists and zspages are assigned 647 * to one of these freelists based on the number of live objects they 648 * have. This functions inserts the given zspage into the freelist 649 * identified by <class, fullness_group>. 650 */ 651 static void insert_zspage(struct page *page, struct size_class *class, 652 enum fullness_group fullness) 653 { 654 struct page **head; 655 656 BUG_ON(!is_first_page(page)); 657 658 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 659 return; 660 661 head = &class->fullness_list[fullness]; 662 if (*head) 663 list_add_tail(&page->lru, &(*head)->lru); 664 665 *head = page; 666 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? 667 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 668 } 669 670 /* 671 * This function removes the given zspage from the freelist identified 672 * by <class, fullness_group>. 673 */ 674 static void remove_zspage(struct page *page, struct size_class *class, 675 enum fullness_group fullness) 676 { 677 struct page **head; 678 679 BUG_ON(!is_first_page(page)); 680 681 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 682 return; 683 684 head = &class->fullness_list[fullness]; 685 BUG_ON(!*head); 686 if (list_empty(&(*head)->lru)) 687 *head = NULL; 688 else if (*head == page) 689 *head = (struct page *)list_entry((*head)->lru.next, 690 struct page, lru); 691 692 list_del_init(&page->lru); 693 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *page)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return page->first_page;
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object handle. We adjust the
 * decoded obj_idx back to its original value since it was adjusted in
 * location_to_obj().
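 *
 * For example (illustrative, with OBJ_INDEX_BITS == 11 as in the 64-bit,
 * 4 KiB-page case), an encoded value of ((pfn << 11) | 5) << 1 decodes
 * back to the page for pfn and obj_idx 5.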
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON(!is_first_page(page));
		return *(unsigned long *)page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle));
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
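		 *
		 * E.g. for an (illustrative) 208-byte class on 4 KiB pages,
		 * the object starting at offset 3952 spills 64 bytes into
		 * the next page, so that page's first object starts at
		 * offset 64 and its page->index is set to 64.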
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			page->first_page = first_page;
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	if (!area->huge) {
		buf = buf + ZS_HANDLE_SIZE;
		size -= ZS_HANDLE_SIZE;
		off += ZS_HANDLE_SIZE;
	}

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}
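
/*
 * Illustrative arithmetic, assuming 4 KiB pages on a 64-bit system:
 * (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1 =
 * (4096 - 32) / 16 + 1 = 255 size classes, matching the figure quoted
 * near ZS_SIZE_CLASS_DELTA above.
 */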
static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
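	 * (The mapping window must also stay short and must not sleep:
	 * get_cpu_var() below disables preemption until zs_unmap_object()
	 * calls put_cpu_var().)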
	 */
	BUG_ON(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct page *first_page,
		struct size_class *class, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
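 *
 * A minimal usage sketch (illustrative only; @pool, @src and @len are
 * caller-provided):
 *
 *	handle = zs_malloc(pool, len);
 *	if (handle) {
 *		void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *
 *		memcpy(dst, src, len);
 *		zs_unmap_object(pool, handle);
 *	}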
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(first_page, class, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct zs_pool *pool, struct size_class *class,
			unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;
	int class_idx;
	enum fullness_group fullness;

	BUG_ON(!obj);

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(pool, class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(unsigned long src, unsigned long dst,
				struct size_class *class)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			BUG_ON(!s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			BUG_ON(!d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find alloced object in zspage from index object and
 * return handle.
 */
static unsigned long find_alloced_obj(struct page *page, int index,
					struct size_class *class)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page which is used for live
	 * objects in the subpage. */
	int index;
	/* how many objects were migrated */
	int nr_migrated;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int nr_migrated = 0;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(s_page, index, class);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(d_page, class, handle);
		zs_object_copy(used_obj, free_obj, class);
		index++;
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(pool, class, used_obj);
		nr_migrated++;
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;
	cc->nr_migrated = nr_migrated;

	return ret;
}

static struct page *alloc_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(page, class, i);
			break;
		}
	}

	return page;
}

static void putback_zspage(struct zs_pool *pool, struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	BUG_ON(!is_first_page(first_page));

	fullness = get_fullness_group(first_page);
	insert_zspage(first_page, class, fullness);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}
}

static struct page *isolate_source_page(struct size_class *class)
{
	struct page *page;

	page = class->fullness_list[ZS_ALMOST_EMPTY];
	if (page)
		remove_zspage(page, class, ZS_ALMOST_EMPTY);

	return page;
}

static unsigned long __zs_compact(struct zs_pool *pool,
				struct size_class *class)
{
	int nr_to_migrate;
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;
	unsigned long nr_total_migrated = 0;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		BUG_ON(!is_first_page(src_page));

		/* The goal is to migrate all live objects in source page */
		nr_to_migrate = src_page->inuse;
		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = alloc_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, try to
			 * allocate another zspage.
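			 * (migrate_zspage() returns -ENOMEM once @d_page
			 * fills up, which is what brings us back around
			 * this loop for the next target page; a return of
			 * 0 means every live object was moved.)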
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
			nr_total_migrated += cc.nr_migrated;
			nr_to_migrate -= cc.nr_migrated;
		}

		/* Stop if we couldn't find slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		putback_zspage(pool, class, src_page);
		spin_unlock(&class->lock);
		nr_total_migrated += cc.nr_migrated;
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);

	return nr_total_migrated;
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	unsigned long nr_migrated = 0;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		nr_migrated += __zs_compact(pool, class);
	}

	return nr_migrated;
}
EXPORT_SYMBOL_GPL(zs_compact);

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse, because the size_class that we want to use
	 * for merging should be larger than or equal to the current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make size_class point to
		 * the previous size_class if possible.
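		 *
		 * For example, two adjacent sizes that pack the same
		 * number of objects into the same number of pages waste
		 * the same space, so they can share one size_class (see
		 * can_merge()); the concrete pairs depend on PAGE_SIZE.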
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	pool->flags = flags;

	if (zs_pool_stat_create(name, pool))
		goto err;

	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");