/*
 * zbud.c
 *
 * Copyright (C) 2013, Seth Jennings, IBM
 *
 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
 *
 * zbud is a special-purpose allocator for storing compressed pages. Contrary
 * to what its name may suggest, zbud is not a buddy allocator, but rather an
 * allocator that "buddies" two compressed pages together in a single memory
 * page.
 *
 * While this design limits storage density, it has simple and deterministic
 * reclaim properties that make it preferable to a higher-density approach
 * when reclaim will be used.
 *
 * zbud works by storing compressed pages, or "zpages", together in pairs in a
 * single memory page called a "zbud page".  The first buddy is "left
 * justified" at the beginning of the zbud page, and the last buddy is "right
 * justified" at the end of the zbud page.  The benefit is that if either
 * buddy is freed, the freed buddy space, coalesced with whatever slack space
 * existed between the buddies, results in the largest possible free region
 * within the zbud page.
 *
 * zbud also provides an attractive lower bound on density.  The ratio of
 * zpages to zbud pages can not be less than 1.  This ensures that zbud can
 * never "do harm" by using more pages to store zpages than the uncompressed
 * zpages would have used on their own.
 *
 * zbud pages are divided into "chunks".  The size of the chunks is fixed at
 * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
 * into chunks allows organizing unbuddied zbud pages into a manageable number
 * of unbuddied lists according to the number of free chunks available in the
 * zbud page.
 *
 * The zbud API differs from that of conventional allocators in that the
 * allocation function, zbud_alloc(), returns an opaque handle to the user,
 * not a dereferenceable pointer.  The user must map the handle using
 * zbud_map() in order to get a usable pointer by which to access the
 * allocation data and unmap the handle with zbud_unmap() when operations
 * on the allocation data are complete.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zbud.h>
#include <linux/zpool.h>

/*****************
 * Structures
 *****************/
/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation.  It also determines the number of
 * freelists maintained in each pool.  NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
 * will be 64 freelists per pool.
 */
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
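/*
 * Worked example (illustrative, assuming 4K pages, i.e. PAGE_SHIFT == 12):
 *   CHUNK_SHIFT = 12 - 6 = 6, so CHUNK_SIZE is 64 bytes,
 *   NCHUNKS = 4096 >> 6 = 64 chunks per zbud page, and
 *   ZHDR_SIZE_ALIGNED = 64 bytes, i.e. the header consumes the first chunk.
 */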
/**
 * struct zbud_pool - stores metadata for each zbud pool
 * @lock:	protects all pool fields and first|last_chunk fields of any
 *		zbud page in the pool
 * @unbuddied:	array of lists tracking zbud pages that only contain one
 *		buddy; the list each zbud page is added to depends on the
 *		size of its free region.
 * @buddied:	list tracking the zbud pages that contain two buddies;
 *		these zbud pages are full
 * @lru:	list tracking the zbud pages in LRU order by most recently
 *		added buddy.
 * @pages_nr:	number of zbud pages in the pool.
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular zbud pool.
 */
struct zbud_pool {
	spinlock_t lock;
	struct list_head unbuddied[NCHUNKS];
	struct list_head buddied;
	struct list_head lru;
	u64 pages_nr;
	struct zbud_ops *ops;
};

/*
 * struct zbud_header - zbud page metadata occupying the first chunk of each
 *			zbud page.
 * @buddy:		links the zbud page into the unbuddied/buddied lists
 *			in the pool
 * @lru:		links the zbud page into the lru list in the pool
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @under_reclaim:	zbud page is being reclaimed; zbud_free() defers
 *			freeing the page to zbud_reclaim_page()
 */
struct zbud_header {
	struct list_head buddy;
	struct list_head lru;
	unsigned int first_chunks;
	unsigned int last_chunks;
	bool under_reclaim;
};

/*****************
 * zpool
 ****************/

#ifdef CONFIG_ZPOOL

static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
{
	return zpool_evict(pool, handle);
}

static struct zbud_ops zbud_zpool_ops = {
	.evict =	zbud_zpool_evict
};

static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
{
	return zbud_create_pool(gfp, &zbud_zpool_ops);
}

static void zbud_zpool_destroy(void *pool)
{
	zbud_destroy_pool(pool);
}

static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	return zbud_alloc(pool, size, gfp, handle);
}

static void zbud_zpool_free(void *pool, unsigned long handle)
{
	zbud_free(pool, handle);
}

static int zbud_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	unsigned int total = 0;
	int ret = -EINVAL;

	while (total < pages) {
		/* try up to 8 LRU pages for each page we must reclaim */
		ret = zbud_reclaim_page(pool, 8);
		if (ret < 0)
			break;
		total++;
	}

	if (reclaimed)
		*reclaimed = total;

	return ret;
}

static void *zbud_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	return zbud_map(pool, handle);
}

static void zbud_zpool_unmap(void *pool, unsigned long handle)
{
	zbud_unmap(pool, handle);
}

static u64 zbud_zpool_total_size(void *pool)
{
	return zbud_get_pool_size(pool) * PAGE_SIZE;
}

static struct zpool_driver zbud_zpool_driver = {
	.type =		"zbud",
	.owner =	THIS_MODULE,
	.create =	zbud_zpool_create,
	.destroy =	zbud_zpool_destroy,
	.malloc =	zbud_zpool_malloc,
	.free =		zbud_zpool_free,
	.shrink =	zbud_zpool_shrink,
	.map =		zbud_zpool_map,
	.unmap =	zbud_zpool_unmap,
	.total_size =	zbud_zpool_total_size,
};

MODULE_ALIAS("zpool-zbud");
#endif /* CONFIG_ZPOOL */

/*****************
 * Helpers
 *****************/
/* Just to make the code easier to read */
enum buddy {
	FIRST,
	LAST
};

/* Converts an allocation size in bytes to size in zbud chunks */
static int size_to_chunks(size_t size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
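/*
 * Worked example (illustrative, 64-byte chunks): a 100-byte allocation
 * rounds up to (100 + 63) >> 6 = 2 chunks, i.e. 128 bytes of chunk space.
 */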
#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

/* Initializes the zbud header of a newly allocated zbud page */
static struct zbud_header *init_zbud_page(struct page *page)
{
	struct zbud_header *zhdr = page_address(page);
	zhdr->first_chunks = 0;
	zhdr->last_chunks = 0;
	INIT_LIST_HEAD(&zhdr->buddy);
	INIT_LIST_HEAD(&zhdr->lru);
	zhdr->under_reclaim = false;
	return zhdr;
}

/* Resets the struct page fields and frees the page */
static void free_zbud_page(struct zbud_header *zhdr)
{
	__free_page(virt_to_page(zhdr));
}

/*
 * Encodes the handle of a particular buddy within a zbud page
 * Pool lock should be held as this function accesses first|last_chunks
 */
static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
{
	unsigned long handle;

	/*
	 * For now, the encoded handle is actually just the pointer to the
	 * data but this might not always be the case.  A little information
	 * hiding.  Add ZHDR_SIZE_ALIGNED to the handle if it is the first
	 * allocation to jump over the zbud header in the first chunk.
	 */
	handle = (unsigned long)zhdr;
	if (bud == FIRST)
		/* skip over zbud header */
		handle += ZHDR_SIZE_ALIGNED;
	else /* bud == LAST */
		handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
	return handle;
}

/* Returns the zbud page where a given handle is stored */
static struct zbud_header *handle_to_zbud_header(unsigned long handle)
{
	return (struct zbud_header *)(handle & PAGE_MASK);
}

/* Returns the number of free chunks in a zbud page */
static int num_free_chunks(struct zbud_header *zhdr)
{
	/*
	 * Rather than branch for different situations, just use the fact
	 * that free buddies have a length of zero to simplify everything.
	 * -1 at the end for the zbud header.
	 */
	return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
}
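/*
 * Worked example (illustrative, 64 chunks of 64 bytes): for a FIRST buddy,
 * encode_handle() yields zhdr + 64, and handle_to_zbud_header() masks the
 * in-page offset back off since zhdr is page-aligned.  With first_chunks == 2
 * and last_chunks == 3, num_free_chunks() returns 64 - 2 - 3 - 1 = 58.
 */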
/*****************
 * API Functions
 *****************/
/**
 * zbud_create_pool() - create a new zbud pool
 * @gfp:	gfp flags when allocating the zbud pool structure
 * @ops:	user-defined operations for the zbud pool
 *
 * Return: pointer to the new zbud pool or NULL if the metadata allocation
 * failed.
 */
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
{
	struct zbud_pool *pool;
	int i;

	pool = kmalloc(sizeof(struct zbud_pool), gfp);
	if (!pool)
		return NULL;
	spin_lock_init(&pool->lock);
	for_each_unbuddied_list(i, 0)
		INIT_LIST_HEAD(&pool->unbuddied[i]);
	INIT_LIST_HEAD(&pool->buddied);
	INIT_LIST_HEAD(&pool->lru);
	pool->pages_nr = 0;
	pool->ops = ops;
	return pool;
}

/**
 * zbud_destroy_pool() - destroys an existing zbud pool
 * @pool:	the zbud pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
void zbud_destroy_pool(struct zbud_pool *pool)
{
	kfree(pool);
}

/**
 * zbud_alloc() - allocates a region of a given size
 * @pool:	zbud pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough
 * to satisfy the allocation request.  A search of the unbuddied lists is
 * performed first.  If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as zbud pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid, -ENOSPC if the size cannot fit in a zbud page,
 * or -ENOMEM if the pool was unable to allocate a new page.
 */
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks, i, freechunks;
	struct zbud_header *zhdr = NULL;
	enum buddy bud;
	struct page *page;

	if (!size || (gfp & __GFP_HIGHMEM))
		return -EINVAL;
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		return -ENOSPC;
	chunks = size_to_chunks(size);
	spin_lock(&pool->lock);

	/* First, try to find an unbuddied zbud page. */
	zhdr = NULL;
	for_each_unbuddied_list(i, chunks) {
		if (!list_empty(&pool->unbuddied[i])) {
			zhdr = list_first_entry(&pool->unbuddied[i],
					struct zbud_header, buddy);
			list_del(&zhdr->buddy);
			if (zhdr->first_chunks == 0)
				bud = FIRST;
			else
				bud = LAST;
			goto found;
		}
	}

	/* Couldn't find unbuddied zbud page, create new one */
	spin_unlock(&pool->lock);
	page = alloc_page(gfp);
	if (!page)
		return -ENOMEM;
	spin_lock(&pool->lock);
	pool->pages_nr++;
	zhdr = init_zbud_page(page);
	bud = FIRST;

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else
		zhdr->last_chunks = chunks;

	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
	} else {
		/* Add to buddied list */
		list_add(&zhdr->buddy, &pool->buddied);
	}

	/* Add/move zbud page to beginning of LRU */
	if (!list_empty(&zhdr->lru))
		list_del(&zhdr->lru);
	list_add(&zhdr->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);

	return 0;
}
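/*
 * Illustrative sketch (not part of the zbud API): how a user of this
 * allocator might store a compressed buffer.  The function name and the
 * __maybe_unused guard are hypothetical; error handling is trimmed for
 * brevity.
 */
static int __maybe_unused zbud_store_example(struct zbud_pool *pool,
			const void *src, size_t len)
{
	unsigned long handle;
	void *dst;
	int ret;

	/* Reserve space; on success, handle identifies the allocation */
	ret = zbud_alloc(pool, len, GFP_KERNEL, &handle);
	if (ret)
		return ret;

	/* Map the opaque handle to a usable pointer, copy in, then unmap */
	dst = zbud_map(pool, handle);
	memcpy(dst, src, len);
	zbud_unmap(pool, handle);
	return 0;
}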
/**
 * zbud_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by zbud_alloc()
 *
 * In the case that the zbud page in which the allocation resides is under
 * reclaim, as indicated by the under_reclaim flag being set, this function
 * only sets the first|last_chunks to 0.  The page is actually freed
 * once both buddies are evicted (see zbud_reclaim_page() below).
 */
void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
	struct zbud_header *zhdr;
	int freechunks;

	spin_lock(&pool->lock);
	zhdr = handle_to_zbud_header(handle);

	/* If first buddy, handle - ZHDR_SIZE_ALIGNED is page aligned */
	if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
		zhdr->last_chunks = 0;
	else
		zhdr->first_chunks = 0;

	if (zhdr->under_reclaim) {
		/* zbud page is under reclaim, reclaim will free */
		spin_unlock(&pool->lock);
		return;
	}

	/* Remove from existing buddy list */
	list_del(&zhdr->buddy);

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* zbud page is empty, free */
		list_del(&zhdr->lru);
		free_zbud_page(zhdr);
		pool->pages_nr--;
	} else {
		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
	}

	spin_unlock(&pool->lock);
}
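/*
 * Worked example (illustrative): a FIRST-buddy handle is the page address
 * plus ZHDR_SIZE_ALIGNED, so (handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK is 0
 * and zbud_free() clears first_chunks; a LAST-buddy handle leaves a nonzero
 * in-page offset after the subtraction and clears last_chunks instead.
 */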
#define list_tail_entry(ptr, type, member) \
	list_entry((ptr)->prev, type, member)

/**
 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * zbud reclaim is different from normal system reclaim in that the reclaim
 * is done from the bottom, up.  This is because only the bottom layer, zbud,
 * has information on how the allocations are organized within each zbud
 * page.  This has the potential to create interesting locking situations
 * between zbud and the user, however.
 *
 * To avoid these, this is how zbud_reclaim_page() should be called:
 *
 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and
 * call the user-defined eviction handler with the pool and handle as
 * arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero.  zbud_reclaim_page() will add the zbud page back to the
 * appropriate list and try the next zbud page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called zbud_free() on the handle.  zbud_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the under_reclaim flag on the zbud header.
 *
 * If all buddies in the zbud page are successfully evicted, then the
 * zbud page can be freed.
 *
 * Return: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, or -EAGAIN
 * if the retry limit was hit.
 */
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
	int i, ret, freechunks;
	struct zbud_header *zhdr;
	unsigned long first_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
			retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
		list_del(&zhdr->lru);
		list_del(&zhdr->buddy);
		/* Protect zbud page against free */
		zhdr->under_reclaim = true;
		/*
		 * We need to encode the handles before unlocking, since we
		 * can race with free that will set (first|last)_chunks to 0
		 */
		first_handle = 0;
		last_handle = 0;
		if (zhdr->first_chunks)
			first_handle = encode_handle(zhdr, FIRST);
		if (zhdr->last_chunks)
			last_handle = encode_handle(zhdr, LAST);
		spin_unlock(&pool->lock);

		/* Issue the eviction callback(s) */
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		spin_lock(&pool->lock);
		zhdr->under_reclaim = false;
		if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
			/*
			 * Both buddies are now free, free the zbud page and
			 * return success.
			 */
			free_zbud_page(zhdr);
			pool->pages_nr--;
			spin_unlock(&pool->lock);
			return 0;
		} else if (zhdr->first_chunks == 0 ||
				zhdr->last_chunks == 0) {
			/* add to unbuddied list */
			freechunks = num_free_chunks(zhdr);
			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		} else {
			/* add to buddied list */
			list_add(&zhdr->buddy, &pool->buddied);
		}

		/* add to beginning of LRU */
		list_add(&zhdr->lru, &pool->lru);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}
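/*
 * Illustrative sketch (hypothetical, not part of this file) of an eviction
 * handler a zbud user might register via zbud_ops.  A real user such as
 * zswap would write the data back to its backing store before freeing;
 * that step is elided here.
 */
static int __maybe_unused example_evict(struct zbud_pool *pool,
			unsigned long handle)
{
	void *data = zbud_map(pool, handle);

	/* ... write the mapped data back to backing storage here ... */
	pr_debug("evicting zpage at %p\n", data);
	zbud_unmap(pool, handle);

	/* zbud_free() defers the page free while under_reclaim is set */
	zbud_free(pool, handle);
	return 0;
}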
/**
 * zbud_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * While trivial for zbud, the mapping functions for other allocators
 * implementing this allocation API could have more complex information
 * encoded in the handle and could create temporary mappings to make the
 * data accessible to the user.
 *
 * Return: a pointer to the mapped allocation
 */
void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
	return (void *)(handle);
}

/**
 * zbud_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}

/**
 * zbud_get_pool_size() - gets the zbud pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Return: size in pages of the given pool.  The pool lock need not be
 * taken to access pages_nr.
 */
u64 zbud_get_pool_size(struct zbud_pool *pool)
{
	return pool->pages_nr;
}

static int __init init_zbud(void)
{
	/* Make sure the zbud header will fit in one chunk */
	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
	pr_info("loaded\n");

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zbud_zpool_driver);
#endif

	return 0;
}

static void __exit exit_zbud(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zbud_zpool_driver);
#endif

	pr_info("unloaded\n");
}

module_init(init_zbud);
module_exit(exit_zbud);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
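/*
 * Usage note (illustrative): with CONFIG_ZPOOL enabled, a zpool user such
 * as zswap can select this allocator at runtime, e.g.:
 *
 *   echo zbud > /sys/module/zswap/parameters/zpool
 */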