// SPDX-License-Identifier: GPL-2.0-only
/*
 * z3fold.c
 *
 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
 * Copyright (C) 2016, Sony Mobile Communications Inc.
 *
 * This implementation is based on zbud written by Seth Jennings.
 *
 * z3fold is a special purpose allocator for storing compressed pages. It
 * can store up to three compressed pages per page which improves the
 * compression ratio of zbud while retaining its main concepts (e.g. always
 * storing an integral number of objects per page) and simplicity.
 * It still has simple and deterministic reclaim properties that make it
 * preferable to a higher density approach (with no requirement on integral
 * number of objects per page) when reclaim is used.
 *
 * As in zbud, pages are divided into "chunks". The size of the chunks is
 * fixed at compile time and is determined by NCHUNKS_ORDER below.
 *
 * z3fold doesn't export any API and is meant to be used via zpool API.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/page-flags.h>
#include <linux/migrate.h>
#include <linux/node.h>
#include <linux/compaction.h>
#include <linux/percpu.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
#include <linux/kmemleak.h>

/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation. It also determines the number of
 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
 * at the beginning of an allocated page are occupied by the z3fold header, so
 * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
 * which is the max number of free chunks in a z3fold page; there will also
 * be 63, or 62, respectively, freelists per pool.
 */
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

#define BUDDY_MASK	(0x3)
#define BUDDY_SHIFT	2
#define SLOTS_ALIGN	(0x40)

/*****************
 * Structures
 *****************/
struct z3fold_pool;
struct z3fold_ops {
	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
};

enum buddy {
	HEADLESS = 0,
	FIRST,
	MIDDLE,
	LAST,
	BUDDIES_MAX = LAST
};

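/**
 * struct z3fold_buddy_slots - handle slots for one z3fold page
 * @slot:	encoded handles, one per possible buddy in the page
 * @pool:	back link to the owning pool, with handle flags kept in the
 *		lower HANDLE_FLAG_MASK bits
 * @lock:	protects the slot array against concurrent handle updates
 */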
struct z3fold_buddy_slots {
	/*
	 * we are using BUDDY_MASK in handle_to_buddy etc. so there should
	 * be enough slots to hold all possible variants
	 */
	unsigned long slot[BUDDY_MASK + 1];
	unsigned long pool; /* back link */
	rwlock_t lock;
};
#define HANDLE_FLAG_MASK	(0x03)

/*
 * struct z3fold_header - z3fold page metadata occupying first chunks of each
 *			z3fold page, except for HEADLESS pages
 * @buddy:		links the z3fold page into the relevant list in the
 *			pool
 * @page_lock:		per-page lock
 * @refcount:		reference count for the z3fold page
 * @work:		work_struct for page layout optimization
 * @slots:		pointer to the structure holding buddy slots
 * @pool:		pointer to the containing pool
 * @cpu:		CPU which this page "belongs" to
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @first_num:		the starting number (for the first handle)
 * @mapped_count:	the number of objects currently mapped
 */
struct z3fold_header {
	struct list_head buddy;
	spinlock_t page_lock;
	struct kref refcount;
	struct work_struct work;
	struct z3fold_buddy_slots *slots;
	struct z3fold_pool *pool;
	short cpu;
	unsigned short first_chunks;
	unsigned short middle_chunks;
	unsigned short last_chunks;
	unsigned short start_middle;
	unsigned short first_num:2;
	unsigned short mapped_count:2;
	unsigned short foreign_handles:2;
};

/**
 * struct z3fold_pool - stores metadata for each z3fold pool
 * @name:	pool name
 * @lock:	protects pool unbuddied/lru lists
 * @stale_lock:	protects pool stale page list
 * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain 2-
 *		buddies; the list each z3fold page is added to depends on
 *		the size of its free region.
 * @lru:	list tracking the z3fold pages in LRU order by most recently
 *		added buddy.
 * @stale:	list of pages marked for freeing
 * @pages_nr:	number of z3fold pages in the pool.
 * @c_handle:	cache for z3fold_buddy_slots allocation
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
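 * @zpool:	zpool driver
 * @zpool_ops:	zpool operations structure with an evict callback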
 * @compact_wq:	workqueue for page layout background optimization
 * @release_wq:	workqueue for safe page release
 * @work:	work_struct for safe page release
 * @inode:	inode for z3fold pseudo filesystem
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular z3fold pool.
 */
struct z3fold_pool {
	const char *name;
	spinlock_t lock;
	spinlock_t stale_lock;
	struct list_head *unbuddied;
	struct list_head lru;
	struct list_head stale;
	atomic64_t pages_nr;
	struct kmem_cache *c_handle;
	const struct z3fold_ops *ops;
	struct zpool *zpool;
	const struct zpool_ops *zpool_ops;
	struct workqueue_struct *compact_wq;
	struct workqueue_struct *release_wq;
	struct work_struct work;
	struct inode *inode;
};

/*
 * Internal z3fold page flags
 */
enum z3fold_page_flags {
	PAGE_HEADLESS = 0,
	MIDDLE_CHUNK_MAPPED,
	NEEDS_COMPACTING,
	PAGE_STALE,
	PAGE_CLAIMED, /* by either reclaim or free */
};

/*
 * handle flags, go under HANDLE_FLAG_MASK
 */
enum z3fold_handle_flags {
	HANDLES_NOFREE = 0,
};

/*
 * Forward declarations
 */
static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
static void compact_page_work(struct work_struct *w);

/*****************
 * Helpers
 *****************/

/* Converts an allocation size in bytes to size in z3fold chunks */
static int size_to_chunks(size_t size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
							gfp_t gfp)
{
	struct z3fold_buddy_slots *slots;

	slots = kmem_cache_zalloc(pool->c_handle,
				 (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));

	if (slots) {
		/* It will be freed separately in free_handle(). */
		kmemleak_not_leak(slots);
		slots->pool = (unsigned long)pool;
		rwlock_init(&slots->lock);
	}

	return slots;
}

static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
{
	return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
}

static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
{
	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
}

/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
	spin_lock(&zhdr->page_lock);
}

/* Try to lock a z3fold page */
static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
{
	return spin_trylock(&zhdr->page_lock);
}

/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
	spin_unlock(&zhdr->page_lock);
}

static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
							bool lock)
{
	struct z3fold_buddy_slots *slots;
	struct z3fold_header *zhdr;
	int locked = 0;

	if (!(handle & (1 << PAGE_HEADLESS))) {
		slots = handle_to_slots(handle);
		do {
			unsigned long addr;

			read_lock(&slots->lock);
			addr = *(unsigned long *)handle;
			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
			if (lock)
				locked = z3fold_page_trylock(zhdr);
			read_unlock(&slots->lock);
			if (locked)
				break;
			cpu_relax();
		} while (lock);
	} else {
		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
	}

	return zhdr;
}

/* Returns the z3fold page where a given handle is stored */
static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
	return __get_z3fold_header(h, false);
}

/* return locked z3fold page if it's not headless */
static inline struct z3fold_header *get_z3fold_header(unsigned long h)
{
	return __get_z3fold_header(h, true);
}

static inline void put_z3fold_header(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (!test_bit(PAGE_HEADLESS, &page->private))
		z3fold_page_unlock(zhdr);
}

static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
{
	struct z3fold_buddy_slots *slots;
	int i;
	bool is_free;

	if (handle & (1 << PAGE_HEADLESS))
		return;

	if (WARN_ON(*(unsigned long *)handle == 0))
		return;

	slots = handle_to_slots(handle);
	write_lock(&slots->lock);
	*(unsigned long *)handle = 0;

	if (test_bit(HANDLES_NOFREE, &slots->pool)) {
		write_unlock(&slots->lock);
		return; /* simple case, nothing else to do */
	}

	if (zhdr->slots != slots)
		zhdr->foreign_handles--;

	is_free = true;
	for (i = 0; i <= BUDDY_MASK; i++) {
		if (slots->slot[i]) {
			is_free = false;
			break;
		}
	}
	write_unlock(&slots->lock);

	if (is_free) {
		struct z3fold_pool *pool = slots_to_pool(slots);

		if (zhdr->slots == slots)
			zhdr->slots = NULL;
		kmem_cache_free(pool->c_handle, slots);
	}
}

static int z3fold_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type z3fold_fs = {
	.name		= "z3fold",
	.init_fs_context = z3fold_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *z3fold_mnt;
static int z3fold_mount(void)
{
	int ret = 0;

	z3fold_mnt = kern_mount(&z3fold_fs);
	if (IS_ERR(z3fold_mnt))
		ret = PTR_ERR(z3fold_mnt);

	return ret;
}

static void z3fold_unmount(void)
{
	kern_unmount(z3fold_mnt);
}

static const struct address_space_operations z3fold_aops;
static int z3fold_register_migration(struct z3fold_pool *pool)
{
	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
	if (IS_ERR(pool->inode)) {
		pool->inode = NULL;
		return 1;
	}

	pool->inode->i_mapping->private_data = pool;
	pool->inode->i_mapping->a_ops = &z3fold_aops;
	return 0;
}

static void z3fold_unregister_migration(struct z3fold_pool *pool)
{
	if (pool->inode)
		iput(pool->inode);
}

/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
					struct z3fold_pool *pool, gfp_t gfp)
{
	struct z3fold_header *zhdr = page_address(page);
	struct z3fold_buddy_slots *slots;

	INIT_LIST_HEAD(&page->lru);
	clear_bit(PAGE_HEADLESS, &page->private);
	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	clear_bit(PAGE_STALE, &page->private);
	clear_bit(PAGE_CLAIMED, &page->private);
	if (headless)
		return zhdr;

	slots = alloc_slots(pool, gfp);
	if (!slots)
		return NULL;

	memset(zhdr, 0, sizeof(*zhdr));
	spin_lock_init(&zhdr->page_lock);
	kref_init(&zhdr->refcount);
	zhdr->cpu = -1;
	zhdr->slots = slots;
	zhdr->pool = pool;
	INIT_LIST_HEAD(&zhdr->buddy);
	INIT_WORK(&zhdr->work, compact_page_work);
	return zhdr;
}

/* Resets the struct page fields and frees the page */
static void free_z3fold_page(struct page *page, bool headless)
{
	if (!headless) {
		lock_page(page);
		__ClearPageMovable(page);
		unlock_page(page);
	}
	ClearPagePrivate(page);
	__free_page(page);
}

/* Helper function to build the index */
static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
{
	return (bud + zhdr->first_num) & BUDDY_MASK;
}

/*
 * Encodes the handle of a particular buddy within a z3fold page
 * Pool lock should be held as this function accesses first_num
 */
static unsigned long __encode_handle(struct z3fold_header *zhdr,
				struct z3fold_buddy_slots *slots,
				enum buddy bud)
{
	unsigned long h = (unsigned long)zhdr;
	int idx = 0;

	/*
	 * For a headless page, its handle is its pointer with the extra
	 * PAGE_HEADLESS bit set
	 */
	if (bud == HEADLESS)
		return h | (1 << PAGE_HEADLESS);

	/* otherwise, return pointer to encoded handle */
	idx = __idx(zhdr, bud);
	h += idx;
	if (bud == LAST)
		h |= (zhdr->last_chunks << BUDDY_SHIFT);

	write_lock(&slots->lock);
	slots->slot[idx] = h;
	write_unlock(&slots->lock);
	return (unsigned long)&slots->slot[idx];
}

static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
	return __encode_handle(zhdr, zhdr->slots, bud);
}

/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
	unsigned long addr;

	read_lock(&slots->lock);
	addr = *(unsigned long *)handle;
	read_unlock(&slots->lock);
	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}

/*
 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 * but that doesn't matter, because the masking will result in the
 * correct buddy number.
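 *
 * For example, with first_num == 2 and bud == LAST (3), __idx() yields
 * (3 + 2) & BUDDY_MASK == 1, which is smaller than first_num; decoding
 * still recovers (1 - 2) & BUDDY_MASK == 3 == LAST.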
 */
static enum buddy handle_to_buddy(unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
	unsigned long addr;

	read_lock(&slots->lock);
	WARN_ON(handle & (1 << PAGE_HEADLESS));
	addr = *(unsigned long *)handle;
	read_unlock(&slots->lock);
	zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
	return (addr - zhdr->first_num) & BUDDY_MASK;
}

static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
{
	return zhdr->pool;
}

static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
	struct page *page = virt_to_page(zhdr);
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);

	WARN_ON(!list_empty(&zhdr->buddy));
	set_bit(PAGE_STALE, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	spin_lock(&pool->lock);
	if (!list_empty(&page->lru))
		list_del_init(&page->lru);
	spin_unlock(&pool->lock);

	if (locked)
		z3fold_page_unlock(zhdr);

	spin_lock(&pool->stale_lock);
	list_add(&zhdr->buddy, &pool->stale);
	queue_work(pool->release_wq, &pool->work);
	spin_unlock(&pool->stale_lock);
}

static void release_z3fold_page(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	__release_z3fold_page(zhdr, false);
}

static void release_z3fold_page_locked(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void release_z3fold_page_locked_list(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
					       refcount);
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);

	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void free_pages_work(struct work_struct *w)
{
	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);

	spin_lock(&pool->stale_lock);
	while (!list_empty(&pool->stale)) {
		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
						struct z3fold_header, buddy);
		struct page *page = virt_to_page(zhdr);

		list_del(&zhdr->buddy);
		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
			continue;
		spin_unlock(&pool->stale_lock);
		cancel_work_sync(&zhdr->work);
		free_z3fold_page(page, false);
		cond_resched();
		spin_lock(&pool->stale_lock);
	}
	spin_unlock(&pool->stale_lock);
}

/*
 * Returns the number of free chunks in a z3fold page.
 * NB: can't be used with HEADLESS pages.
 */
static int num_free_chunks(struct z3fold_header *zhdr)
{
	int nfree;
	/*
	 * If there is a middle object, pick up the bigger free space
	 * either before or after it. Otherwise just subtract the number
	 * of chunks occupied by the first and the last objects.
	 */
	if (zhdr->middle_chunks != 0) {
		int nfree_before = zhdr->first_chunks ?
			0 : zhdr->start_middle - ZHDR_CHUNKS;
		int nfree_after = zhdr->last_chunks ?
			0 : TOTAL_CHUNKS -
				(zhdr->start_middle + zhdr->middle_chunks);
		nfree = max(nfree_before, nfree_after);
	} else
		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
	return nfree;
}

/* Add to the appropriate unbuddied list */
static inline void add_to_unbuddied(struct z3fold_pool *pool,
				struct z3fold_header *zhdr)
{
	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
			zhdr->middle_chunks == 0) {
		struct list_head *unbuddied;
		int freechunks = num_free_chunks(zhdr);

		migrate_disable();
		unbuddied = this_cpu_ptr(pool->unbuddied);
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[freechunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
		migrate_enable();
	}
}

static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
{
	enum buddy bud = HEADLESS;

	if (zhdr->middle_chunks) {
		if (!zhdr->first_chunks &&
		    chunks <= zhdr->start_middle - ZHDR_CHUNKS)
			bud = FIRST;
		else if (!zhdr->last_chunks)
			bud = LAST;
	} else {
		if (!zhdr->first_chunks)
			bud = FIRST;
		else if (!zhdr->last_chunks)
			bud = LAST;
		else
			bud = MIDDLE;
	}

	return bud;
}

static inline void *mchunk_memmove(struct z3fold_header *zhdr,
				unsigned short dst_chunk)
{
	void *beg = zhdr;
	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
		       beg + (zhdr->start_middle << CHUNK_SHIFT),
		       zhdr->middle_chunks << CHUNK_SHIFT);
}

static inline bool buddy_single(struct z3fold_header *zhdr)
{
	return !((zhdr->first_chunks && zhdr->middle_chunks) ||
			(zhdr->first_chunks && zhdr->last_chunks) ||
			(zhdr->middle_chunks && zhdr->last_chunks));
}

static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
{
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
	void *p = zhdr;
	unsigned long old_handle = 0;
	size_t sz = 0;
	struct z3fold_header *new_zhdr = NULL;
	int first_idx = __idx(zhdr, FIRST);
	int middle_idx = __idx(zhdr, MIDDLE);
	int last_idx = __idx(zhdr, LAST);
	unsigned short *moved_chunks = NULL;

	/*
	 * No need to protect slots here -- all the slots are "local" and
	 * the page lock is already taken
	 */
	if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
		p += ZHDR_SIZE_ALIGNED;
		sz = zhdr->first_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
		moved_chunks = &zhdr->first_chunks;
	} else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
		p += zhdr->start_middle << CHUNK_SHIFT;
		sz = zhdr->middle_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
		moved_chunks = &zhdr->middle_chunks;
	} else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
		p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
		sz = zhdr->last_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
		moved_chunks = &zhdr->last_chunks;
	}

	if (sz > 0) {
		enum buddy new_bud = HEADLESS;
		short chunks = size_to_chunks(sz);
		void *q;

		new_zhdr = __z3fold_alloc(pool, sz, false);
		if (!new_zhdr)
			return NULL;

		if (WARN_ON(new_zhdr == zhdr))
			goto out_fail;

		new_bud = get_free_buddy(new_zhdr, chunks);
		q = new_zhdr;
		switch (new_bud) {
		case FIRST:
			new_zhdr->first_chunks = chunks;
			q += ZHDR_SIZE_ALIGNED;
			break;
		case MIDDLE:
			new_zhdr->middle_chunks = chunks;
			new_zhdr->start_middle =
				new_zhdr->first_chunks + ZHDR_CHUNKS;
			q += new_zhdr->start_middle << CHUNK_SHIFT;
			break;
		case LAST:
			new_zhdr->last_chunks = chunks;
			q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
			break;
		default:
			goto out_fail;
		}
		new_zhdr->foreign_handles++;
		memcpy(q, p, sz);
		write_lock(&zhdr->slots->lock);
		*(unsigned long *)old_handle = (unsigned long)new_zhdr +
			__idx(new_zhdr, new_bud);
		if (new_bud == LAST)
			*(unsigned long *)old_handle |=
					(new_zhdr->last_chunks << BUDDY_SHIFT);
		write_unlock(&zhdr->slots->lock);
		add_to_unbuddied(pool, new_zhdr);
		z3fold_page_unlock(new_zhdr);

		*moved_chunks = 0;
	}

	return new_zhdr;

out_fail:
	if (new_zhdr) {
		if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
			atomic64_dec(&pool->pages_nr);
		else {
			add_to_unbuddied(pool, new_zhdr);
			z3fold_page_unlock(new_zhdr);
		}
	}
	return NULL;

}

#define BIG_CHUNK_GAP	3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
		return 0; /* can't move middle chunk, it's used */

	if (unlikely(PageIsolated(page)))
		return 0;

	if (zhdr->middle_chunks == 0)
		return 0; /* nothing to compact */

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* move to the beginning */
		mchunk_memmove(zhdr, ZHDR_CHUNKS);
		zhdr->first_chunks = zhdr->middle_chunks;
		zhdr->middle_chunks = 0;
		zhdr->start_middle = 0;
		zhdr->first_num++;
		return 1;
	}

	/*
	 * moving data is expensive, so let's only do that if
	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
	 */
	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
			BIG_CHUNK_GAP) {
		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
		return 1;
	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
					+ zhdr->middle_chunks) >=
			BIG_CHUNK_GAP) {
		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
			zhdr->middle_chunks;
		mchunk_memmove(zhdr, new_start);
		zhdr->start_middle = new_start;
		return 1;
	}

	return 0;
}

static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
	struct page *page;

	page = virt_to_page(zhdr);
	if (locked)
		WARN_ON(z3fold_page_trylock(zhdr));
	else
		z3fold_page_lock(zhdr);
	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
		z3fold_page_unlock(zhdr);
		return;
	}
	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}

	if (test_bit(PAGE_STALE, &page->private) ||
	    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
		z3fold_page_unlock(zhdr);
		return;
	}

	if (!zhdr->foreign_handles && buddy_single(zhdr) &&
	    zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
		if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
			atomic64_dec(&pool->pages_nr);
		else {
			clear_bit(PAGE_CLAIMED, &page->private);
			z3fold_page_unlock(zhdr);
		}
		return;
	}

	z3fold_compact_page(zhdr);
	add_to_unbuddied(pool, zhdr);
	clear_bit(PAGE_CLAIMED, &page->private);
	z3fold_page_unlock(zhdr);
}

static void compact_page_work(struct work_struct *w)
{
	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
						work);

	do_compact_page(zhdr, false);
}

/* returns _locked_ z3fold page header or NULL */
static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
						size_t size, bool can_sleep)
{
	struct z3fold_header *zhdr = NULL;
	struct page *page;
	struct list_head *unbuddied;
	int chunks = size_to_chunks(size), i;

lookup:
	migrate_disable();
	/* First, try to find an unbuddied z3fold page. */
	unbuddied = this_cpu_ptr(pool->unbuddied);
	for_each_unbuddied_list(i, chunks) {
		struct list_head *l = &unbuddied[i];

		zhdr = list_first_entry_or_null(READ_ONCE(l),
					struct z3fold_header, buddy);

		if (!zhdr)
			continue;

		/* Re-check under lock. */
		spin_lock(&pool->lock);
		l = &unbuddied[i];
		if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
				struct z3fold_header, buddy)) ||
		    !z3fold_page_trylock(zhdr)) {
			spin_unlock(&pool->lock);
			zhdr = NULL;
			migrate_enable();
			if (can_sleep)
				cond_resched();
			goto lookup;
		}
		list_del_init(&zhdr->buddy);
		zhdr->cpu = -1;
		spin_unlock(&pool->lock);

		page = virt_to_page(zhdr);
		if (test_bit(NEEDS_COMPACTING, &page->private) ||
		    test_bit(PAGE_CLAIMED, &page->private)) {
			z3fold_page_unlock(zhdr);
			zhdr = NULL;
			migrate_enable();
			if (can_sleep)
				cond_resched();
			goto lookup;
		}

		/*
		 * this page could not be removed from its unbuddied
		 * list while pool lock was held, and then we've taken
		 * page lock so kref_put could not be called before
		 * we got here, so it's safe to just call kref_get()
		 */
		kref_get(&zhdr->refcount);
		break;
	}
	migrate_enable();

	if (!zhdr) {
		int cpu;

		/* look for _exact_ match on other cpus' lists */
		for_each_online_cpu(cpu) {
			struct list_head *l;

			unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
			spin_lock(&pool->lock);
			l = &unbuddied[chunks];

			zhdr = list_first_entry_or_null(READ_ONCE(l),
						struct z3fold_header, buddy);

			if (!zhdr || !z3fold_page_trylock(zhdr)) {
				spin_unlock(&pool->lock);
				zhdr = NULL;
				continue;
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			spin_unlock(&pool->lock);

			page = virt_to_page(zhdr);
			if (test_bit(NEEDS_COMPACTING, &page->private) ||
			    test_bit(PAGE_CLAIMED, &page->private)) {
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				if (can_sleep)
					cond_resched();
				continue;
			}
			kref_get(&zhdr->refcount);
			break;
		}
	}

	if (zhdr && !zhdr->slots)
		zhdr->slots = alloc_slots(pool,
					can_sleep ? GFP_NOIO : GFP_ATOMIC);
	return zhdr;
}

/*
 * API Functions
 */

/**
 * z3fold_create_pool() - create a new z3fold pool
 * @name:	pool name
 * @gfp:	gfp flags when allocating the z3fold pool structure
 * @ops:	user-defined operations for the z3fold pool
 *
 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 * failed.
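 * The returned pool should be released with z3fold_destroy_pool() once it
 * is no longer needed.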
 */
static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
		const struct z3fold_ops *ops)
{
	struct z3fold_pool *pool = NULL;
	int i, cpu;

	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
	if (!pool)
		goto out;
	pool->c_handle = kmem_cache_create("z3fold_handle",
				sizeof(struct z3fold_buddy_slots),
				SLOTS_ALIGN, 0, NULL);
	if (!pool->c_handle)
		goto out_c;
	spin_lock_init(&pool->lock);
	spin_lock_init(&pool->stale_lock);
	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
	if (!pool->unbuddied)
		goto out_pool;
	for_each_possible_cpu(cpu) {
		struct list_head *unbuddied =
				per_cpu_ptr(pool->unbuddied, cpu);
		for_each_unbuddied_list(i, 0)
			INIT_LIST_HEAD(&unbuddied[i]);
	}
	INIT_LIST_HEAD(&pool->lru);
	INIT_LIST_HEAD(&pool->stale);
	atomic64_set(&pool->pages_nr, 0);
	pool->name = name;
	pool->compact_wq = create_singlethread_workqueue(pool->name);
	if (!pool->compact_wq)
		goto out_unbuddied;
	pool->release_wq = create_singlethread_workqueue(pool->name);
	if (!pool->release_wq)
		goto out_wq;
	if (z3fold_register_migration(pool))
		goto out_rwq;
	INIT_WORK(&pool->work, free_pages_work);
	pool->ops = ops;
	return pool;

out_rwq:
	destroy_workqueue(pool->release_wq);
out_wq:
	destroy_workqueue(pool->compact_wq);
out_unbuddied:
	free_percpu(pool->unbuddied);
out_pool:
	kmem_cache_destroy(pool->c_handle);
out_c:
	kfree(pool);
out:
	return NULL;
}

/**
 * z3fold_destroy_pool() - destroys an existing z3fold pool
 * @pool:	the z3fold pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
	kmem_cache_destroy(pool->c_handle);

	/*
	 * We need to destroy pool->compact_wq before pool->release_wq,
	 * as any pending work on pool->compact_wq will call
	 * queue_work(pool->release_wq, &pool->work).
	 *
	 * There are still outstanding pages until both workqueues are drained,
	 * so we cannot unregister migration until then.
	 */

	destroy_workqueue(pool->compact_wq);
	destroy_workqueue(pool->release_wq);
	z3fold_unregister_migration(pool);
	kfree(pool);
}

/**
 * z3fold_alloc() - allocates a region of a given size
 * @pool:	z3fold pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough to
 * satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as z3fold pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 * a new page.
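 * -ENOSPC is returned if the requested size cannot fit in a single page.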
 */
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks = size_to_chunks(size);
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	enum buddy bud;
	bool can_sleep = gfpflags_allow_blocking(gfp);

	if (!size)
		return -EINVAL;

	if (size > PAGE_SIZE)
		return -ENOSPC;

	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		bud = HEADLESS;
	else {
retry:
		zhdr = __z3fold_alloc(pool, size, can_sleep);
		if (zhdr) {
			bud = get_free_buddy(zhdr, chunks);
			if (bud == HEADLESS) {
				if (kref_put(&zhdr->refcount,
					     release_z3fold_page_locked))
					atomic64_dec(&pool->pages_nr);
				else
					z3fold_page_unlock(zhdr);
				pr_err("No free chunks in unbuddied\n");
				WARN_ON(1);
				goto retry;
			}
			page = virt_to_page(zhdr);
			goto found;
		}
		bud = FIRST;
	}

	page = NULL;
	if (can_sleep) {
		spin_lock(&pool->stale_lock);
		zhdr = list_first_entry_or_null(&pool->stale,
						struct z3fold_header, buddy);
		/*
		 * Before allocating a page, let's see if we can take one from
		 * the stale pages list. cancel_work_sync() can sleep so we
		 * limit this case to the contexts where we can sleep
		 */
		if (zhdr) {
			list_del(&zhdr->buddy);
			spin_unlock(&pool->stale_lock);
			cancel_work_sync(&zhdr->work);
			page = virt_to_page(zhdr);
		} else {
			spin_unlock(&pool->stale_lock);
		}
	}
	if (!page)
		page = alloc_page(gfp);

	if (!page)
		return -ENOMEM;

	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
	if (!zhdr) {
		__free_page(page);
		return -ENOMEM;
	}
	atomic64_inc(&pool->pages_nr);

	if (bud == HEADLESS) {
		set_bit(PAGE_HEADLESS, &page->private);
		goto headless;
	}
	if (can_sleep) {
		lock_page(page);
		__SetPageMovable(page, pool->inode->i_mapping);
		unlock_page(page);
	} else {
		if (trylock_page(page)) {
			__SetPageMovable(page, pool->inode->i_mapping);
			unlock_page(page);
		}
	}
	z3fold_page_lock(zhdr);

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else if (bud == LAST)
		zhdr->last_chunks = chunks;
	else {
		zhdr->middle_chunks = chunks;
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
	}
	add_to_unbuddied(pool, zhdr);

headless:
	spin_lock(&pool->lock);
	/* Add/move z3fold page to beginning of LRU */
	if (!list_empty(&page->lru))
		list_del(&page->lru);

	list_add(&page->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);
	if (bud != HEADLESS)
		z3fold_page_unlock(zhdr);

	return 0;
}

/**
 * z3fold_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by z3fold_alloc()
 *
 * In the case that the z3fold page in which the allocation resides is under
 * reclaim, as indicated by the PG_reclaim flag being set, this function
 * only sets the first|last_chunks to 0. The page is actually freed
 * once both buddies are evicted (see z3fold_reclaim_page() below).
 */
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy bud;
	bool page_claimed;

	zhdr = get_z3fold_header(handle);
	page = virt_to_page(zhdr);
	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);

	if (test_bit(PAGE_HEADLESS, &page->private)) {
		/* if a headless page is under reclaim, just leave.
		 * NB: we use test_and_set_bit for a reason: if the bit
		 * has not been set before, we release this page
		 * immediately so we don't care about its value any more.
		 */
		if (!page_claimed) {
			spin_lock(&pool->lock);
			list_del(&page->lru);
			spin_unlock(&pool->lock);
			put_z3fold_header(zhdr);
			free_z3fold_page(page, true);
			atomic64_dec(&pool->pages_nr);
		}
		return;
	}

	/* Non-headless case */
	bud = handle_to_buddy(handle);

	switch (bud) {
	case FIRST:
		zhdr->first_chunks = 0;
		break;
	case MIDDLE:
		zhdr->middle_chunks = 0;
		break;
	case LAST:
		zhdr->last_chunks = 0;
		break;
	default:
		pr_err("%s: unknown bud %d\n", __func__, bud);
		WARN_ON(1);
		put_z3fold_header(zhdr);
		return;
	}

	if (!page_claimed)
		free_handle(handle, zhdr);
	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	if (page_claimed) {
		/* the page has not been claimed by us */
		z3fold_page_unlock(zhdr);
		return;
	}
	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
		put_z3fold_header(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		return;
	}
	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
		spin_lock(&pool->lock);
		list_del_init(&zhdr->buddy);
		spin_unlock(&pool->lock);
		zhdr->cpu = -1;
		kref_get(&zhdr->refcount);
		clear_bit(PAGE_CLAIMED, &page->private);
		do_compact_page(zhdr, true);
		return;
	}
	kref_get(&zhdr->refcount);
	clear_bit(PAGE_CLAIMED, &page->private);
	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
	put_z3fold_header(zhdr);
}

/**
 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * z3fold reclaim is different from normal system reclaim in that it is done
 * from the bottom, up. This is because only the bottom layer, z3fold, has
 * information on how the allocations are organized within each z3fold page.
 * This has the potential to create interesting locking situations between
 * z3fold and the user, however.
 *
 * To avoid these, this is how z3fold_reclaim_page() should be called:
 *
 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
 * call the user-defined eviction handler with the pool and handle as
 * arguments.
 *
 * If the handle cannot be evicted, the eviction handler should return
 * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
 * appropriate list and try the next z3fold page on the LRU up to
 * a user-defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the setting of the PG_reclaim flag on the underlying page.
 *
 * If all buddies in the z3fold page are successfully evicted, then the
 * z3fold page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
	int i, ret = -1;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	struct list_head *pos;
	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
	struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));

	rwlock_init(&slots.lock);
	slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		if (list_empty(&pool->lru)) {
			spin_unlock(&pool->lock);
			return -EINVAL;
		}
		list_for_each_prev(pos, &pool->lru) {
			page = list_entry(pos, struct page, lru);

			zhdr = page_address(page);
			if (test_bit(PAGE_HEADLESS, &page->private))
				break;

			if (kref_get_unless_zero(&zhdr->refcount) == 0) {
				zhdr = NULL;
				break;
			}
			if (!z3fold_page_trylock(zhdr)) {
				if (kref_put(&zhdr->refcount,
						release_z3fold_page))
					atomic64_dec(&pool->pages_nr);
				zhdr = NULL;
				continue; /* can't evict at this point */
			}

			/* test_and_set_bit is of course atomic, but we still
			 * need to do it under page lock, otherwise checking
			 * that bit in __z3fold_alloc wouldn't make sense
			 */
			if (zhdr->foreign_handles ||
			    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
				if (kref_put(&zhdr->refcount,
						release_z3fold_page))
					atomic64_dec(&pool->pages_nr);
				else
					z3fold_page_unlock(zhdr);
				zhdr = NULL;
				continue; /* can't evict such page */
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			break;
		}

		if (!zhdr)
			break;

		list_del_init(&page->lru);
		spin_unlock(&pool->lock);

		if (!test_bit(PAGE_HEADLESS, &page->private)) {
			/*
			 * We need to encode the handles before unlocking, and
			 * use our local slots structure because z3fold_free
			 * can zero out zhdr->slots and we can't do much
			 * about that
			 */
			first_handle = 0;
			last_handle = 0;
			middle_handle = 0;
			memset(slots.slot, 0, sizeof(slots.slot));
			if (zhdr->first_chunks)
				first_handle = __encode_handle(zhdr, &slots,
							       FIRST);
			if (zhdr->middle_chunks)
				middle_handle = __encode_handle(zhdr, &slots,
								MIDDLE);
			if (zhdr->last_chunks)
				last_handle = __encode_handle(zhdr, &slots,
							      LAST);
			/*
			 * it's safe to unlock here because we hold a
			 * reference to this page
			 */
			z3fold_page_unlock(zhdr);
		} else {
			first_handle = encode_handle(zhdr, HEADLESS);
			last_handle = middle_handle = 0;
		}
		/* Issue the eviction callback(s) */
		if (middle_handle) {
			ret = pool->ops->evict(pool, middle_handle);
			if (ret)
				goto next;
		}
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		if (test_bit(PAGE_HEADLESS, &page->private)) {
			if (ret == 0) {
				free_z3fold_page(page, true);
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			clear_bit(PAGE_CLAIMED, &page->private);
		} else {
			struct z3fold_buddy_slots *slots = zhdr->slots;
			z3fold_page_lock(zhdr);
			if (kref_put(&zhdr->refcount,
					release_z3fold_page_locked)) {
				kmem_cache_free(pool->c_handle, slots);
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			/*
			 * if we are here, the page is still not completely
			 * free. Take the global pool lock then to be able
			 * to add it back to the lru list
			 */
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			z3fold_page_unlock(zhdr);
			clear_bit(PAGE_CLAIMED, &page->private);
		}

		/* We started off locked, so we need to lock the pool back */
		spin_lock(&pool->lock);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}

/**
 * z3fold_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * Extracts the buddy number from handle and constructs the pointer to the
 * correct starting chunk within the page.
 *
 * Returns: a pointer to the mapped allocation
 */
static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	void *addr;
	enum buddy buddy;

	zhdr = get_z3fold_header(handle);
	addr = zhdr;
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		goto out;

	buddy = handle_to_buddy(handle);
	switch (buddy) {
	case FIRST:
		addr += ZHDR_SIZE_ALIGNED;
		break;
	case MIDDLE:
		addr += zhdr->start_middle << CHUNK_SHIFT;
		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
		break;
	case LAST:
		addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
		break;
	default:
		pr_err("unknown buddy id %d\n", buddy);
		WARN_ON(1);
		addr = NULL;
		break;
	}

	if (addr)
		zhdr->mapped_count++;
out:
	put_z3fold_header(zhdr);
	return addr;
}

/**
 * z3fold_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy buddy;

	zhdr = get_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		return;

	buddy = handle_to_buddy(handle);
	if (buddy == MIDDLE)
		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	zhdr->mapped_count--;
	put_z3fold_header(zhdr);
}

/**
 * z3fold_get_pool_size() - gets the z3fold pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.
 */
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
	return atomic64_read(&pool->pages_nr);
}

static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
	struct z3fold_header *zhdr;
	struct z3fold_pool *pool;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(PageIsolated(page), page);

	if (test_bit(PAGE_HEADLESS, &page->private))
		return false;

	zhdr = page_address(page);
	z3fold_page_lock(zhdr);
	if (test_bit(NEEDS_COMPACTING, &page->private) ||
	    test_bit(PAGE_STALE, &page->private))
		goto out;

	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
		goto out;

	if (test_and_set_bit(PAGE_CLAIMED, &page->private))
		goto out;
	pool = zhdr_to_pool(zhdr);
	spin_lock(&pool->lock);
	if (!list_empty(&zhdr->buddy))
		list_del_init(&zhdr->buddy);
	if (!list_empty(&page->lru))
		list_del_init(&page->lru);
	spin_unlock(&pool->lock);

	kref_get(&zhdr->refcount);
	z3fold_page_unlock(zhdr);
	return true;

out:
	z3fold_page_unlock(zhdr);
	return false;
}

static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
			       struct page *page, enum migrate_mode mode)
{
	struct z3fold_header *zhdr, *new_zhdr;
	struct z3fold_pool *pool;
	struct address_space *new_mapping;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);
	VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

	zhdr = page_address(page);
	pool = zhdr_to_pool(zhdr);

	if (!z3fold_page_trylock(zhdr))
		return -EAGAIN;
	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
		z3fold_page_unlock(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		return -EBUSY;
	}
	if (work_pending(&zhdr->work)) {
		z3fold_page_unlock(zhdr);
		return -EAGAIN;
	}
	new_zhdr = page_address(newpage);
	memcpy(new_zhdr, zhdr, PAGE_SIZE);
	newpage->private = page->private;
	page->private = 0;
	z3fold_page_unlock(zhdr);
	spin_lock_init(&new_zhdr->page_lock);
	INIT_WORK(&new_zhdr->work, compact_page_work);
	/*
	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
	 * so we only have to reinitialize it.
	 */
	INIT_LIST_HEAD(&new_zhdr->buddy);
	new_mapping = page_mapping(page);
	__ClearPageMovable(page);
	ClearPagePrivate(page);

	get_page(newpage);
	z3fold_page_lock(new_zhdr);
	if (new_zhdr->first_chunks)
		encode_handle(new_zhdr, FIRST);
	if (new_zhdr->last_chunks)
		encode_handle(new_zhdr, LAST);
	if (new_zhdr->middle_chunks)
		encode_handle(new_zhdr, MIDDLE);
	set_bit(NEEDS_COMPACTING, &newpage->private);
	new_zhdr->cpu = smp_processor_id();
	spin_lock(&pool->lock);
	list_add(&newpage->lru, &pool->lru);
	spin_unlock(&pool->lock);
	__SetPageMovable(newpage, new_mapping);
	z3fold_page_unlock(new_zhdr);

	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

	page_mapcount_reset(page);
	clear_bit(PAGE_CLAIMED, &page->private);
	put_page(page);
	return 0;
}

static void z3fold_page_putback(struct page *page)
{
	struct z3fold_header *zhdr;
	struct z3fold_pool *pool;

	zhdr = page_address(page);
	pool = zhdr_to_pool(zhdr);

	z3fold_page_lock(zhdr);
	if (!list_empty(&zhdr->buddy))
		list_del_init(&zhdr->buddy);
	INIT_LIST_HEAD(&page->lru);
	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	spin_lock(&pool->lock);
	list_add(&page->lru, &pool->lru);
	spin_unlock(&pool->lock);
	clear_bit(PAGE_CLAIMED, &page->private);
	z3fold_page_unlock(zhdr);
}

static const struct address_space_operations z3fold_aops = {
	.isolate_page = z3fold_page_isolate,
	.migratepage = z3fold_page_migrate,
	.putback_page = z3fold_page_putback,
};

/*****************
 * zpool
 ****************/

static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
{
	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
		return pool->zpool_ops->evict(pool->zpool, handle);
	else
		return -ENOENT;
}

static const struct z3fold_ops z3fold_zpool_ops = {
	.evict =	z3fold_zpool_evict
};

static void *z3fold_zpool_create(const char *name, gfp_t gfp,
			       const struct zpool_ops *zpool_ops,
			       struct zpool *zpool)
{
	struct z3fold_pool *pool;

	pool = z3fold_create_pool(name, gfp,
				  zpool_ops ? &z3fold_zpool_ops : NULL);
	if (pool) {
		pool->zpool = zpool;
		pool->zpool_ops = zpool_ops;
	}
	return pool;
}

static void z3fold_zpool_destroy(void *pool)
{
	z3fold_destroy_pool(pool);
}

static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	return z3fold_alloc(pool, size, gfp, handle);
}
static void z3fold_zpool_free(void *pool, unsigned long handle)
{
	z3fold_free(pool, handle);
}

static int z3fold_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	unsigned int total = 0;
	int ret = -EINVAL;

	while (total < pages) {
		ret = z3fold_reclaim_page(pool, 8);
		if (ret < 0)
			break;
		total++;
	}

	if (reclaimed)
		*reclaimed = total;

	return ret;
}

static void *z3fold_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	return z3fold_map(pool, handle);
}
static void z3fold_zpool_unmap(void *pool, unsigned long handle)
{
	z3fold_unmap(pool, handle);
}

static u64 z3fold_zpool_total_size(void *pool)
{
	return z3fold_get_pool_size(pool) * PAGE_SIZE;
}

static struct zpool_driver z3fold_zpool_driver = {
	.type =		"z3fold",
	.sleep_mapped = true,
	.owner =	THIS_MODULE,
	.create =	z3fold_zpool_create,
	.destroy =	z3fold_zpool_destroy,
	.malloc =	z3fold_zpool_malloc,
	.free =		z3fold_zpool_free,
	.shrink =	z3fold_zpool_shrink,
	.map =		z3fold_zpool_map,
	.unmap =	z3fold_zpool_unmap,
	.total_size =	z3fold_zpool_total_size,
};

MODULE_ALIAS("zpool-z3fold");

static int __init init_z3fold(void)
{
	int ret;

	/* Make sure the z3fold header is not larger than the page size */
	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
	ret = z3fold_mount();
	if (ret)
		return ret;

	zpool_register_driver(&z3fold_zpool_driver);

	return 0;
}

static void __exit exit_z3fold(void)
{
	z3fold_unmount();
	zpool_unregister_driver(&z3fold_zpool_driver);
}

module_init(init_z3fold);
module_exit(exit_z3fold);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");