1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/kernel.h> 7 #include <linux/bio.h> 8 #include <linux/file.h> 9 #include <linux/fs.h> 10 #include <linux/pagemap.h> 11 #include <linux/pagevec.h> 12 #include <linux/highmem.h> 13 #include <linux/kthread.h> 14 #include <linux/time.h> 15 #include <linux/init.h> 16 #include <linux/string.h> 17 #include <linux/backing-dev.h> 18 #include <linux/writeback.h> 19 #include <linux/psi.h> 20 #include <linux/slab.h> 21 #include <linux/sched/mm.h> 22 #include <linux/log2.h> 23 #include <linux/shrinker.h> 24 #include <crypto/hash.h> 25 #include "misc.h" 26 #include "ctree.h" 27 #include "fs.h" 28 #include "btrfs_inode.h" 29 #include "bio.h" 30 #include "ordered-data.h" 31 #include "compression.h" 32 #include "extent_io.h" 33 #include "extent_map.h" 34 #include "subpage.h" 35 #include "messages.h" 36 #include "super.h" 37 38 static struct bio_set btrfs_compressed_bioset; 39 40 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; 41 42 const char* btrfs_compress_type2str(enum btrfs_compression_type type) 43 { 44 switch (type) { 45 case BTRFS_COMPRESS_ZLIB: 46 case BTRFS_COMPRESS_LZO: 47 case BTRFS_COMPRESS_ZSTD: 48 case BTRFS_COMPRESS_NONE: 49 return btrfs_compress_types[type]; 50 default: 51 break; 52 } 53 54 return NULL; 55 } 56 57 static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) 58 { 59 return container_of(bbio, struct compressed_bio, bbio); 60 } 61 62 static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, 63 u64 start, blk_opf_t op, 64 btrfs_bio_end_io_t end_io) 65 { 66 struct btrfs_bio *bbio; 67 68 bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, 69 GFP_NOFS, &btrfs_compressed_bioset)); 70 btrfs_bio_init(bbio, inode, start, end_io, NULL); 71 return to_compressed_bio(bbio); 72 } 73 74 bool btrfs_compress_is_valid_type(const char *str, size_t len) 75 { 76 int i; 77 78 for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { 79 size_t comp_len = strlen(btrfs_compress_types[i]); 80 81 if (len < comp_len) 82 continue; 83 84 if (!strncmp(btrfs_compress_types[i], str, comp_len)) 85 return true; 86 } 87 return false; 88 } 89 90 static int compression_compress_pages(int type, struct list_head *ws, 91 struct btrfs_inode *inode, u64 start, 92 struct folio **folios, unsigned long *out_folios, 93 unsigned long *total_in, unsigned long *total_out) 94 { 95 switch (type) { 96 case BTRFS_COMPRESS_ZLIB: 97 return zlib_compress_folios(ws, inode, start, folios, 98 out_folios, total_in, total_out); 99 case BTRFS_COMPRESS_LZO: 100 return lzo_compress_folios(ws, inode, start, folios, 101 out_folios, total_in, total_out); 102 case BTRFS_COMPRESS_ZSTD: 103 return zstd_compress_folios(ws, inode, start, folios, 104 out_folios, total_in, total_out); 105 case BTRFS_COMPRESS_NONE: 106 default: 107 /* 108 * This can happen when compression races with remount setting 109 * it to 'no compress', while caller doesn't call 110 * inode_need_compress() to check if we really need to 111 * compress. 112 * 113 * Not a big deal, just need to inform caller that we 114 * haven't allocated any pages yet. 115 */ 116 *out_folios = 0; 117 return -E2BIG; 118 } 119 } 120 121 static int compression_decompress_bio(struct list_head *ws, 122 struct compressed_bio *cb) 123 { 124 switch (cb->compress_type) { 125 case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); 126 case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); 127 case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); 128 case BTRFS_COMPRESS_NONE: 129 default: 130 /* 131 * This can't happen, the type is validated several times 132 * before we get here. 133 */ 134 BUG(); 135 } 136 } 137 138 static int compression_decompress(int type, struct list_head *ws, 139 const u8 *data_in, struct folio *dest_folio, 140 unsigned long dest_pgoff, size_t srclen, size_t destlen) 141 { 142 switch (type) { 143 case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, 144 dest_pgoff, srclen, destlen); 145 case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, 146 dest_pgoff, srclen, destlen); 147 case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, 148 dest_pgoff, srclen, destlen); 149 case BTRFS_COMPRESS_NONE: 150 default: 151 /* 152 * This can't happen, the type is validated several times 153 * before we get here. 154 */ 155 BUG(); 156 } 157 } 158 159 static void btrfs_free_compressed_folios(struct compressed_bio *cb) 160 { 161 for (unsigned int i = 0; i < cb->nr_folios; i++) 162 btrfs_free_compr_folio(cb->compressed_folios[i]); 163 kfree(cb->compressed_folios); 164 } 165 166 static int btrfs_decompress_bio(struct compressed_bio *cb); 167 168 /* 169 * Global cache of last unused pages for compression/decompression. 170 */ 171 static struct btrfs_compr_pool { 172 struct shrinker *shrinker; 173 spinlock_t lock; 174 struct list_head list; 175 int count; 176 int thresh; 177 } compr_pool; 178 179 static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc) 180 { 181 int ret; 182 183 /* 184 * We must not read the values more than once if 'ret' gets expanded in 185 * the return statement so we don't accidentally return a negative 186 * number, even if the first condition finds it positive. 187 */ 188 ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh); 189 190 return ret > 0 ? ret : 0; 191 } 192 193 static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) 194 { 195 LIST_HEAD(remove); 196 struct list_head *tmp, *next; 197 int freed; 198 199 if (compr_pool.count == 0) 200 return SHRINK_STOP; 201 202 /* For now, just simply drain the whole list. */ 203 spin_lock(&compr_pool.lock); 204 list_splice_init(&compr_pool.list, &remove); 205 freed = compr_pool.count; 206 compr_pool.count = 0; 207 spin_unlock(&compr_pool.lock); 208 209 list_for_each_safe(tmp, next, &remove) { 210 struct page *page = list_entry(tmp, struct page, lru); 211 212 ASSERT(page_ref_count(page) == 1); 213 put_page(page); 214 } 215 216 return freed; 217 } 218 219 /* 220 * Common wrappers for page allocation from compression wrappers 221 */ 222 struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) 223 { 224 struct folio *folio = NULL; 225 226 /* For bs > ps cases, no cached folio pool for now. */ 227 if (fs_info->block_min_order) 228 goto alloc; 229 230 spin_lock(&compr_pool.lock); 231 if (compr_pool.count > 0) { 232 folio = list_first_entry(&compr_pool.list, struct folio, lru); 233 list_del_init(&folio->lru); 234 compr_pool.count--; 235 } 236 spin_unlock(&compr_pool.lock); 237 238 if (folio) 239 return folio; 240 241 alloc: 242 return folio_alloc(GFP_NOFS, fs_info->block_min_order); 243 } 244 245 void btrfs_free_compr_folio(struct folio *folio) 246 { 247 bool do_free = false; 248 249 /* The folio is from bs > ps fs, no cached pool for now. */ 250 if (folio_order(folio)) 251 goto free; 252 253 spin_lock(&compr_pool.lock); 254 if (compr_pool.count > compr_pool.thresh) { 255 do_free = true; 256 } else { 257 list_add(&folio->lru, &compr_pool.list); 258 compr_pool.count++; 259 } 260 spin_unlock(&compr_pool.lock); 261 262 if (!do_free) 263 return; 264 265 free: 266 ASSERT(folio_ref_count(folio) == 1); 267 folio_put(folio); 268 } 269 270 static void end_bbio_compressed_read(struct btrfs_bio *bbio) 271 { 272 struct compressed_bio *cb = to_compressed_bio(bbio); 273 blk_status_t status = bbio->bio.bi_status; 274 275 if (!status) 276 status = errno_to_blk_status(btrfs_decompress_bio(cb)); 277 278 btrfs_free_compressed_folios(cb); 279 btrfs_bio_end_io(cb->orig_bbio, status); 280 bio_put(&bbio->bio); 281 } 282 283 /* 284 * Clear the writeback bits on all of the file 285 * pages for a compressed write 286 */ 287 static noinline void end_compressed_writeback(const struct compressed_bio *cb) 288 { 289 struct inode *inode = &cb->bbio.inode->vfs_inode; 290 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 291 pgoff_t index = cb->start >> PAGE_SHIFT; 292 const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; 293 struct folio_batch fbatch; 294 int i; 295 int ret; 296 297 ret = blk_status_to_errno(cb->bbio.bio.bi_status); 298 if (ret) 299 mapping_set_error(inode->i_mapping, ret); 300 301 folio_batch_init(&fbatch); 302 while (index <= end_index) { 303 ret = filemap_get_folios(inode->i_mapping, &index, end_index, 304 &fbatch); 305 306 if (ret == 0) 307 return; 308 309 for (i = 0; i < ret; i++) { 310 struct folio *folio = fbatch.folios[i]; 311 312 btrfs_folio_clamp_clear_writeback(fs_info, folio, 313 cb->start, cb->len); 314 } 315 folio_batch_release(&fbatch); 316 } 317 /* the inode may be gone now */ 318 } 319 320 /* 321 * Do the cleanup once all the compressed pages hit the disk. This will clear 322 * writeback on the file pages and free the compressed pages. 323 * 324 * This also calls the writeback end hooks for the file pages so that metadata 325 * and checksums can be updated in the file. 326 */ 327 static void end_bbio_compressed_write(struct btrfs_bio *bbio) 328 { 329 struct compressed_bio *cb = to_compressed_bio(bbio); 330 331 btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, 332 cb->bbio.bio.bi_status == BLK_STS_OK); 333 334 if (cb->writeback) 335 end_compressed_writeback(cb); 336 /* Note, our inode could be gone now. */ 337 btrfs_free_compressed_folios(cb); 338 bio_put(&cb->bbio.bio); 339 } 340 341 static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) 342 { 343 struct bio *bio = &cb->bbio.bio; 344 u32 offset = 0; 345 unsigned int findex = 0; 346 347 while (offset < cb->compressed_len) { 348 struct folio *folio = cb->compressed_folios[findex]; 349 u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); 350 int ret; 351 352 /* Maximum compressed extent is smaller than bio size limit. */ 353 ret = bio_add_folio(bio, folio, len, 0); 354 ASSERT(ret); 355 offset += len; 356 findex++; 357 } 358 } 359 360 /* 361 * worker function to build and submit bios for previously compressed pages. 362 * The corresponding pages in the inode should be marked for writeback 363 * and the compressed pages should have a reference on them for dropping 364 * when the IO is complete. 365 * 366 * This also checksums the file bytes and gets things ready for 367 * the end io hooks. 368 */ 369 void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, 370 struct folio **compressed_folios, 371 unsigned int nr_folios, 372 blk_opf_t write_flags, 373 bool writeback) 374 { 375 struct btrfs_inode *inode = ordered->inode; 376 struct btrfs_fs_info *fs_info = inode->root->fs_info; 377 struct compressed_bio *cb; 378 379 ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); 380 ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); 381 382 cb = alloc_compressed_bio(inode, ordered->file_offset, 383 REQ_OP_WRITE | write_flags, 384 end_bbio_compressed_write); 385 cb->start = ordered->file_offset; 386 cb->len = ordered->num_bytes; 387 cb->compressed_folios = compressed_folios; 388 cb->compressed_len = ordered->disk_num_bytes; 389 cb->writeback = writeback; 390 cb->nr_folios = nr_folios; 391 cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; 392 cb->bbio.ordered = ordered; 393 btrfs_add_compressed_bio_folios(cb); 394 395 btrfs_submit_bbio(&cb->bbio, 0); 396 } 397 398 /* 399 * Add extra pages in the same compressed file extent so that we don't need to 400 * re-read the same extent again and again. 401 * 402 * NOTE: this won't work well for subpage, as for subpage read, we lock the 403 * full page then submit bio for each compressed/regular extents. 404 * 405 * This means, if we have several sectors in the same page points to the same 406 * on-disk compressed data, we will re-read the same extent many times and 407 * this function can only help for the next page. 408 */ 409 static noinline int add_ra_bio_pages(struct inode *inode, 410 u64 compressed_end, 411 struct compressed_bio *cb, 412 int *memstall, unsigned long *pflags) 413 { 414 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 415 pgoff_t end_index; 416 struct bio *orig_bio = &cb->orig_bbio->bio; 417 u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; 418 u64 isize = i_size_read(inode); 419 int ret; 420 struct folio *folio; 421 struct extent_map *em; 422 struct address_space *mapping = inode->i_mapping; 423 struct extent_map_tree *em_tree; 424 struct extent_io_tree *tree; 425 int sectors_missed = 0; 426 427 em_tree = &BTRFS_I(inode)->extent_tree; 428 tree = &BTRFS_I(inode)->io_tree; 429 430 if (isize == 0) 431 return 0; 432 433 /* 434 * For current subpage support, we only support 64K page size, 435 * which means maximum compressed extent size (128K) is just 2x page 436 * size. 437 * This makes readahead less effective, so here disable readahead for 438 * subpage for now, until full compressed write is supported. 439 */ 440 if (fs_info->sectorsize < PAGE_SIZE) 441 return 0; 442 443 /* For bs > ps cases, we don't support readahead for compressed folios for now. */ 444 if (fs_info->block_min_order) 445 return 0; 446 447 end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 448 449 while (cur < compressed_end) { 450 pgoff_t page_end; 451 pgoff_t pg_index = cur >> PAGE_SHIFT; 452 u32 add_size; 453 454 if (pg_index > end_index) 455 break; 456 457 folio = filemap_get_folio(mapping, pg_index); 458 if (!IS_ERR(folio)) { 459 u64 folio_sz = folio_size(folio); 460 u64 offset = offset_in_folio(folio, cur); 461 462 folio_put(folio); 463 sectors_missed += (folio_sz - offset) >> 464 fs_info->sectorsize_bits; 465 466 /* Beyond threshold, no need to continue */ 467 if (sectors_missed > 4) 468 break; 469 470 /* 471 * Jump to next page start as we already have page for 472 * current offset. 473 */ 474 cur += (folio_sz - offset); 475 continue; 476 } 477 478 folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, 479 ~__GFP_FS), 0); 480 if (!folio) 481 break; 482 483 if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { 484 /* There is already a page, skip to page end */ 485 cur += folio_size(folio); 486 folio_put(folio); 487 continue; 488 } 489 490 if (!*memstall && folio_test_workingset(folio)) { 491 psi_memstall_enter(pflags); 492 *memstall = 1; 493 } 494 495 ret = set_folio_extent_mapped(folio); 496 if (ret < 0) { 497 folio_unlock(folio); 498 folio_put(folio); 499 break; 500 } 501 502 page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; 503 btrfs_lock_extent(tree, cur, page_end, NULL); 504 read_lock(&em_tree->lock); 505 em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); 506 read_unlock(&em_tree->lock); 507 508 /* 509 * At this point, we have a locked page in the page cache for 510 * these bytes in the file. But, we have to make sure they map 511 * to this compressed extent on disk. 512 */ 513 if (!em || cur < em->start || 514 (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) || 515 (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) != 516 orig_bio->bi_iter.bi_sector) { 517 btrfs_free_extent_map(em); 518 btrfs_unlock_extent(tree, cur, page_end, NULL); 519 folio_unlock(folio); 520 folio_put(folio); 521 break; 522 } 523 add_size = min(em->start + em->len, page_end + 1) - cur; 524 btrfs_free_extent_map(em); 525 btrfs_unlock_extent(tree, cur, page_end, NULL); 526 527 if (folio_contains(folio, end_index)) { 528 size_t zero_offset = offset_in_folio(folio, isize); 529 530 if (zero_offset) { 531 int zeros; 532 zeros = folio_size(folio) - zero_offset; 533 folio_zero_range(folio, zero_offset, zeros); 534 } 535 } 536 537 if (!bio_add_folio(orig_bio, folio, add_size, 538 offset_in_folio(folio, cur))) { 539 folio_unlock(folio); 540 folio_put(folio); 541 break; 542 } 543 /* 544 * If it's subpage, we also need to increase its 545 * subpage::readers number, as at endio we will decrease 546 * subpage::readers and to unlock the page. 547 */ 548 if (fs_info->sectorsize < PAGE_SIZE) 549 btrfs_folio_set_lock(fs_info, folio, cur, add_size); 550 folio_put(folio); 551 cur += add_size; 552 } 553 return 0; 554 } 555 556 /* 557 * for a compressed read, the bio we get passed has all the inode pages 558 * in it. We don't actually do IO on those pages but allocate new ones 559 * to hold the compressed pages on disk. 560 * 561 * bio->bi_iter.bi_sector points to the compressed extent on disk 562 * bio->bi_io_vec points to all of the inode pages 563 * 564 * After the compressed pages are read, we copy the bytes into the 565 * bio we were passed and then call the bio end_io calls 566 */ 567 void btrfs_submit_compressed_read(struct btrfs_bio *bbio) 568 { 569 struct btrfs_inode *inode = bbio->inode; 570 struct btrfs_fs_info *fs_info = inode->root->fs_info; 571 struct extent_map_tree *em_tree = &inode->extent_tree; 572 struct compressed_bio *cb; 573 unsigned int compressed_len; 574 u64 file_offset = bbio->file_offset; 575 u64 em_len; 576 u64 em_start; 577 struct extent_map *em; 578 unsigned long pflags; 579 int memstall = 0; 580 blk_status_t status; 581 int ret; 582 583 /* we need the actual starting offset of this extent in the file */ 584 read_lock(&em_tree->lock); 585 em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); 586 read_unlock(&em_tree->lock); 587 if (!em) { 588 status = BLK_STS_IOERR; 589 goto out; 590 } 591 592 ASSERT(btrfs_extent_map_is_compressed(em)); 593 compressed_len = em->disk_num_bytes; 594 595 cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, 596 end_bbio_compressed_read); 597 598 cb->start = em->start - em->offset; 599 em_len = em->len; 600 em_start = em->start; 601 602 cb->len = bbio->bio.bi_iter.bi_size; 603 cb->compressed_len = compressed_len; 604 cb->compress_type = btrfs_extent_map_compression(em); 605 cb->orig_bbio = bbio; 606 cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; 607 608 btrfs_free_extent_map(em); 609 610 cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); 611 cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); 612 if (!cb->compressed_folios) { 613 status = BLK_STS_RESOURCE; 614 goto out_free_bio; 615 } 616 617 ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, 618 cb->compressed_folios); 619 if (ret) { 620 status = BLK_STS_RESOURCE; 621 goto out_free_compressed_pages; 622 } 623 624 add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, 625 &pflags); 626 627 /* include any pages we added in add_ra-bio_pages */ 628 cb->len = bbio->bio.bi_iter.bi_size; 629 cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; 630 btrfs_add_compressed_bio_folios(cb); 631 632 if (memstall) 633 psi_memstall_leave(&pflags); 634 635 btrfs_submit_bbio(&cb->bbio, 0); 636 return; 637 638 out_free_compressed_pages: 639 kfree(cb->compressed_folios); 640 out_free_bio: 641 bio_put(&cb->bbio.bio); 642 out: 643 btrfs_bio_end_io(bbio, status); 644 } 645 646 /* 647 * Heuristic uses systematic sampling to collect data from the input data 648 * range, the logic can be tuned by the following constants: 649 * 650 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample 651 * @SAMPLING_INTERVAL - range from which the sampled data can be collected 652 */ 653 #define SAMPLING_READ_SIZE (16) 654 #define SAMPLING_INTERVAL (256) 655 656 /* 657 * For statistical analysis of the input data we consider bytes that form a 658 * Galois Field of 256 objects. Each object has an attribute count, ie. how 659 * many times the object appeared in the sample. 660 */ 661 #define BUCKET_SIZE (256) 662 663 /* 664 * The size of the sample is based on a statistical sampling rule of thumb. 665 * The common way is to perform sampling tests as long as the number of 666 * elements in each cell is at least 5. 667 * 668 * Instead of 5, we choose 32 to obtain more accurate results. 669 * If the data contain the maximum number of symbols, which is 256, we obtain a 670 * sample size bound by 8192. 671 * 672 * For a sample of at most 8KB of data per data range: 16 consecutive bytes 673 * from up to 512 locations. 674 */ 675 #define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ 676 SAMPLING_READ_SIZE / SAMPLING_INTERVAL) 677 678 struct bucket_item { 679 u32 count; 680 }; 681 682 struct heuristic_ws { 683 /* Partial copy of input data */ 684 u8 *sample; 685 u32 sample_size; 686 /* Buckets store counters for each byte value */ 687 struct bucket_item *bucket; 688 /* Sorting buffer */ 689 struct bucket_item *bucket_b; 690 struct list_head list; 691 }; 692 693 static void free_heuristic_ws(struct list_head *ws) 694 { 695 struct heuristic_ws *workspace; 696 697 workspace = list_entry(ws, struct heuristic_ws, list); 698 699 kvfree(workspace->sample); 700 kfree(workspace->bucket); 701 kfree(workspace->bucket_b); 702 kfree(workspace); 703 } 704 705 static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) 706 { 707 struct heuristic_ws *ws; 708 709 ws = kzalloc(sizeof(*ws), GFP_KERNEL); 710 if (!ws) 711 return ERR_PTR(-ENOMEM); 712 713 ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); 714 if (!ws->sample) 715 goto fail; 716 717 ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); 718 if (!ws->bucket) 719 goto fail; 720 721 ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); 722 if (!ws->bucket_b) 723 goto fail; 724 725 INIT_LIST_HEAD(&ws->list); 726 return &ws->list; 727 fail: 728 free_heuristic_ws(&ws->list); 729 return ERR_PTR(-ENOMEM); 730 } 731 732 const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 }; 733 734 static const struct btrfs_compress_levels * const btrfs_compress_levels[] = { 735 /* The heuristic is represented as compression type 0 */ 736 &btrfs_heuristic_compress, 737 &btrfs_zlib_compress, 738 &btrfs_lzo_compress, 739 &btrfs_zstd_compress, 740 }; 741 742 static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level) 743 { 744 switch (type) { 745 case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info); 746 case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level); 747 case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info); 748 case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level); 749 default: 750 /* 751 * This can't happen, the type is validated several times 752 * before we get here. 753 */ 754 BUG(); 755 } 756 } 757 758 static void free_workspace(int type, struct list_head *ws) 759 { 760 switch (type) { 761 case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws); 762 case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws); 763 case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws); 764 case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws); 765 default: 766 /* 767 * This can't happen, the type is validated several times 768 * before we get here. 769 */ 770 BUG(); 771 } 772 } 773 774 static int alloc_workspace_manager(struct btrfs_fs_info *fs_info, 775 enum btrfs_compression_type type) 776 { 777 struct workspace_manager *gwsm; 778 struct list_head *workspace; 779 780 ASSERT(fs_info->compr_wsm[type] == NULL); 781 gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL); 782 if (!gwsm) 783 return -ENOMEM; 784 785 INIT_LIST_HEAD(&gwsm->idle_ws); 786 spin_lock_init(&gwsm->ws_lock); 787 atomic_set(&gwsm->total_ws, 0); 788 init_waitqueue_head(&gwsm->ws_wait); 789 fs_info->compr_wsm[type] = gwsm; 790 791 /* 792 * Preallocate one workspace for each compression type so we can 793 * guarantee forward progress in the worst case 794 */ 795 workspace = alloc_workspace(fs_info, type, 0); 796 if (IS_ERR(workspace)) { 797 btrfs_warn(fs_info, 798 "cannot preallocate compression workspace for %s, will try later", 799 btrfs_compress_type2str(type)); 800 } else { 801 atomic_set(&gwsm->total_ws, 1); 802 gwsm->free_ws = 1; 803 list_add(workspace, &gwsm->idle_ws); 804 } 805 return 0; 806 } 807 808 static void free_workspace_manager(struct btrfs_fs_info *fs_info, 809 enum btrfs_compression_type type) 810 { 811 struct list_head *ws; 812 struct workspace_manager *gwsm = fs_info->compr_wsm[type]; 813 814 /* ZSTD uses its own workspace manager, should enter here. */ 815 ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES); 816 if (!gwsm) 817 return; 818 fs_info->compr_wsm[type] = NULL; 819 while (!list_empty(&gwsm->idle_ws)) { 820 ws = gwsm->idle_ws.next; 821 list_del(ws); 822 free_workspace(type, ws); 823 atomic_dec(&gwsm->total_ws); 824 } 825 kfree(gwsm); 826 } 827 828 /* 829 * This finds an available workspace or allocates a new one. 830 * If it's not possible to allocate a new one, waits until there's one. 831 * Preallocation makes a forward progress guarantees and we do not return 832 * errors. 833 */ 834 struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level) 835 { 836 struct workspace_manager *wsm = fs_info->compr_wsm[type]; 837 struct list_head *workspace; 838 int cpus = num_online_cpus(); 839 unsigned nofs_flag; 840 struct list_head *idle_ws; 841 spinlock_t *ws_lock; 842 atomic_t *total_ws; 843 wait_queue_head_t *ws_wait; 844 int *free_ws; 845 846 ASSERT(wsm); 847 idle_ws = &wsm->idle_ws; 848 ws_lock = &wsm->ws_lock; 849 total_ws = &wsm->total_ws; 850 ws_wait = &wsm->ws_wait; 851 free_ws = &wsm->free_ws; 852 853 again: 854 spin_lock(ws_lock); 855 if (!list_empty(idle_ws)) { 856 workspace = idle_ws->next; 857 list_del(workspace); 858 (*free_ws)--; 859 spin_unlock(ws_lock); 860 return workspace; 861 862 } 863 if (atomic_read(total_ws) > cpus) { 864 DEFINE_WAIT(wait); 865 866 spin_unlock(ws_lock); 867 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); 868 if (atomic_read(total_ws) > cpus && !*free_ws) 869 schedule(); 870 finish_wait(ws_wait, &wait); 871 goto again; 872 } 873 atomic_inc(total_ws); 874 spin_unlock(ws_lock); 875 876 /* 877 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have 878 * to turn it off here because we might get called from the restricted 879 * context of btrfs_compress_bio/btrfs_compress_pages 880 */ 881 nofs_flag = memalloc_nofs_save(); 882 workspace = alloc_workspace(fs_info, type, level); 883 memalloc_nofs_restore(nofs_flag); 884 885 if (IS_ERR(workspace)) { 886 atomic_dec(total_ws); 887 wake_up(ws_wait); 888 889 /* 890 * Do not return the error but go back to waiting. There's a 891 * workspace preallocated for each type and the compression 892 * time is bounded so we get to a workspace eventually. This 893 * makes our caller's life easier. 894 * 895 * To prevent silent and low-probability deadlocks (when the 896 * initial preallocation fails), check if there are any 897 * workspaces at all. 898 */ 899 if (atomic_read(total_ws) == 0) { 900 static DEFINE_RATELIMIT_STATE(_rs, 901 /* once per minute */ 60 * HZ, 902 /* no burst */ 1); 903 904 if (__ratelimit(&_rs)) 905 btrfs_warn(fs_info, 906 "no compression workspaces, low memory, retrying"); 907 } 908 goto again; 909 } 910 return workspace; 911 } 912 913 static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level) 914 { 915 switch (type) { 916 case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level); 917 case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level); 918 case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level); 919 case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level); 920 default: 921 /* 922 * This can't happen, the type is validated several times 923 * before we get here. 924 */ 925 BUG(); 926 } 927 } 928 929 /* 930 * put a workspace struct back on the list or free it if we have enough 931 * idle ones sitting around 932 */ 933 void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) 934 { 935 struct workspace_manager *gwsm = fs_info->compr_wsm[type]; 936 struct list_head *idle_ws; 937 spinlock_t *ws_lock; 938 atomic_t *total_ws; 939 wait_queue_head_t *ws_wait; 940 int *free_ws; 941 942 ASSERT(gwsm); 943 idle_ws = &gwsm->idle_ws; 944 ws_lock = &gwsm->ws_lock; 945 total_ws = &gwsm->total_ws; 946 ws_wait = &gwsm->ws_wait; 947 free_ws = &gwsm->free_ws; 948 949 spin_lock(ws_lock); 950 if (*free_ws <= num_online_cpus()) { 951 list_add(ws, idle_ws); 952 (*free_ws)++; 953 spin_unlock(ws_lock); 954 goto wake; 955 } 956 spin_unlock(ws_lock); 957 958 free_workspace(type, ws); 959 atomic_dec(total_ws); 960 wake: 961 cond_wake_up(ws_wait); 962 } 963 964 static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) 965 { 966 switch (type) { 967 case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws); 968 case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws); 969 case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws); 970 case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws); 971 default: 972 /* 973 * This can't happen, the type is validated several times 974 * before we get here. 975 */ 976 BUG(); 977 } 978 } 979 980 /* 981 * Adjust @level according to the limits of the compression algorithm or 982 * fallback to default 983 */ 984 static int btrfs_compress_set_level(unsigned int type, int level) 985 { 986 const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; 987 988 if (level == 0) 989 level = levels->default_level; 990 else 991 level = clamp(level, levels->min_level, levels->max_level); 992 993 return level; 994 } 995 996 /* 997 * Check whether the @level is within the valid range for the given type. 998 */ 999 bool btrfs_compress_level_valid(unsigned int type, int level) 1000 { 1001 const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; 1002 1003 return levels->min_level <= level && level <= levels->max_level; 1004 } 1005 1006 /* Wrapper around find_get_page(), with extra error message. */ 1007 int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, 1008 struct folio **in_folio_ret) 1009 { 1010 struct folio *in_folio; 1011 1012 /* 1013 * The compressed write path should have the folio locked already, thus 1014 * we only need to grab one reference. 1015 */ 1016 in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT); 1017 if (IS_ERR(in_folio)) { 1018 struct btrfs_inode *inode = BTRFS_I(mapping->host); 1019 1020 btrfs_crit(inode->root->fs_info, 1021 "failed to get page cache, root %lld ino %llu file offset %llu", 1022 btrfs_root_id(inode->root), btrfs_ino(inode), start); 1023 return -ENOENT; 1024 } 1025 *in_folio_ret = in_folio; 1026 return 0; 1027 } 1028 1029 /* 1030 * Given an address space and start and length, compress the bytes into @pages 1031 * that are allocated on demand. 1032 * 1033 * @type_level is encoded algorithm and level, where level 0 means whatever 1034 * default the algorithm chooses and is opaque here; 1035 * - compression algo are 0-3 1036 * - the level are bits 4-7 1037 * 1038 * @out_folios is an in/out parameter, holds maximum number of folios to allocate 1039 * and returns number of actually allocated folios 1040 * 1041 * @total_in is used to return the number of bytes actually read. It 1042 * may be smaller than the input length if we had to exit early because we 1043 * ran out of room in the folios array or because we cross the 1044 * max_out threshold. 1045 * 1046 * @total_out is an in/out parameter, must be set to the input length and will 1047 * be also used to return the total number of compressed bytes 1048 */ 1049 int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, 1050 u64 start, struct folio **folios, unsigned long *out_folios, 1051 unsigned long *total_in, unsigned long *total_out) 1052 { 1053 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1054 const unsigned long orig_len = *total_out; 1055 struct list_head *workspace; 1056 int ret; 1057 1058 level = btrfs_compress_set_level(type, level); 1059 workspace = get_workspace(fs_info, type, level); 1060 ret = compression_compress_pages(type, workspace, inode, start, folios, 1061 out_folios, total_in, total_out); 1062 /* The total read-in bytes should be no larger than the input. */ 1063 ASSERT(*total_in <= orig_len); 1064 put_workspace(fs_info, type, workspace); 1065 return ret; 1066 } 1067 1068 static int btrfs_decompress_bio(struct compressed_bio *cb) 1069 { 1070 struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 1071 struct list_head *workspace; 1072 int ret; 1073 int type = cb->compress_type; 1074 1075 workspace = get_workspace(fs_info, type, 0); 1076 ret = compression_decompress_bio(workspace, cb); 1077 put_workspace(fs_info, type, workspace); 1078 1079 if (!ret) 1080 zero_fill_bio(&cb->orig_bbio->bio); 1081 return ret; 1082 } 1083 1084 /* 1085 * a less complex decompression routine. Our compressed data fits in a 1086 * single page, and we want to read a single page out of it. 1087 * dest_pgoff tells us the offset into the destination folio where we write the 1088 * decompressed data. 1089 */ 1090 int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, 1091 unsigned long dest_pgoff, size_t srclen, size_t destlen) 1092 { 1093 struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); 1094 struct list_head *workspace; 1095 const u32 sectorsize = fs_info->sectorsize; 1096 int ret; 1097 1098 /* 1099 * The full destination folio range should not exceed the folio size. 1100 * And the @destlen should not exceed sectorsize, as this is only called for 1101 * inline file extents, which should not exceed sectorsize. 1102 */ 1103 ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize); 1104 1105 workspace = get_workspace(fs_info, type, 0); 1106 ret = compression_decompress(type, workspace, data_in, dest_folio, 1107 dest_pgoff, srclen, destlen); 1108 put_workspace(fs_info, type, workspace); 1109 1110 return ret; 1111 } 1112 1113 int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) 1114 { 1115 int ret; 1116 1117 ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); 1118 if (ret < 0) 1119 goto error; 1120 ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); 1121 if (ret < 0) 1122 goto error; 1123 ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); 1124 if (ret < 0) 1125 goto error; 1126 ret = zstd_alloc_workspace_manager(fs_info); 1127 if (ret < 0) 1128 goto error; 1129 return 0; 1130 error: 1131 btrfs_free_compress_wsm(fs_info); 1132 return ret; 1133 } 1134 1135 void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info) 1136 { 1137 free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); 1138 free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); 1139 free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); 1140 zstd_free_workspace_manager(fs_info); 1141 } 1142 1143 int __init btrfs_init_compress(void) 1144 { 1145 if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, 1146 offsetof(struct compressed_bio, bbio.bio), 1147 BIOSET_NEED_BVECS)) 1148 return -ENOMEM; 1149 1150 compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages"); 1151 if (!compr_pool.shrinker) 1152 return -ENOMEM; 1153 1154 spin_lock_init(&compr_pool.lock); 1155 INIT_LIST_HEAD(&compr_pool.list); 1156 compr_pool.count = 0; 1157 /* 128K / 4K = 32, for 8 threads is 256 pages. */ 1158 compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8; 1159 compr_pool.shrinker->count_objects = btrfs_compr_pool_count; 1160 compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan; 1161 compr_pool.shrinker->batch = 32; 1162 compr_pool.shrinker->seeks = DEFAULT_SEEKS; 1163 shrinker_register(compr_pool.shrinker); 1164 1165 return 0; 1166 } 1167 1168 void __cold btrfs_exit_compress(void) 1169 { 1170 /* For now scan drains all pages and does not touch the parameters. */ 1171 btrfs_compr_pool_scan(NULL, NULL); 1172 shrinker_free(compr_pool.shrinker); 1173 1174 bioset_exit(&btrfs_compressed_bioset); 1175 } 1176 1177 /* 1178 * The bvec is a single page bvec from a bio that contains folios from a filemap. 1179 * 1180 * Since the folio may be a large one, and if the bv_page is not a head page of 1181 * a large folio, then page->index is unreliable. 1182 * 1183 * Thus we need this helper to grab the proper file offset. 1184 */ 1185 static u64 file_offset_from_bvec(const struct bio_vec *bvec) 1186 { 1187 const struct page *page = bvec->bv_page; 1188 const struct folio *folio = page_folio(page); 1189 1190 return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset; 1191 } 1192 1193 /* 1194 * Copy decompressed data from working buffer to pages. 1195 * 1196 * @buf: The decompressed data buffer 1197 * @buf_len: The decompressed data length 1198 * @decompressed: Number of bytes that are already decompressed inside the 1199 * compressed extent 1200 * @cb: The compressed extent descriptor 1201 * @orig_bio: The original bio that the caller wants to read for 1202 * 1203 * An easier to understand graph is like below: 1204 * 1205 * |<- orig_bio ->| |<- orig_bio->| 1206 * |<------- full decompressed extent ----->| 1207 * |<----------- @cb range ---->| 1208 * | |<-- @buf_len -->| 1209 * |<--- @decompressed --->| 1210 * 1211 * Note that, @cb can be a subpage of the full decompressed extent, but 1212 * @cb->start always has the same as the orig_file_offset value of the full 1213 * decompressed extent. 1214 * 1215 * When reading compressed extent, we have to read the full compressed extent, 1216 * while @orig_bio may only want part of the range. 1217 * Thus this function will ensure only data covered by @orig_bio will be copied 1218 * to. 1219 * 1220 * Return 0 if we have copied all needed contents for @orig_bio. 1221 * Return >0 if we need continue decompress. 1222 */ 1223 int btrfs_decompress_buf2page(const char *buf, u32 buf_len, 1224 struct compressed_bio *cb, u32 decompressed) 1225 { 1226 struct bio *orig_bio = &cb->orig_bbio->bio; 1227 /* Offset inside the full decompressed extent */ 1228 u32 cur_offset; 1229 1230 cur_offset = decompressed; 1231 /* The main loop to do the copy */ 1232 while (cur_offset < decompressed + buf_len) { 1233 struct bio_vec bvec; 1234 size_t copy_len; 1235 u32 copy_start; 1236 /* Offset inside the full decompressed extent */ 1237 u32 bvec_offset; 1238 void *kaddr; 1239 1240 bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); 1241 /* 1242 * cb->start may underflow, but subtracting that value can still 1243 * give us correct offset inside the full decompressed extent. 1244 */ 1245 bvec_offset = file_offset_from_bvec(&bvec) - cb->start; 1246 1247 /* Haven't reached the bvec range, exit */ 1248 if (decompressed + buf_len <= bvec_offset) 1249 return 1; 1250 1251 copy_start = max(cur_offset, bvec_offset); 1252 copy_len = min(bvec_offset + bvec.bv_len, 1253 decompressed + buf_len) - copy_start; 1254 ASSERT(copy_len); 1255 1256 /* 1257 * Extra range check to ensure we didn't go beyond 1258 * @buf + @buf_len. 1259 */ 1260 ASSERT(copy_start - decompressed < buf_len); 1261 1262 kaddr = bvec_kmap_local(&bvec); 1263 memcpy(kaddr, buf + copy_start - decompressed, copy_len); 1264 kunmap_local(kaddr); 1265 1266 cur_offset += copy_len; 1267 bio_advance(orig_bio, copy_len); 1268 /* Finished the bio */ 1269 if (!orig_bio->bi_iter.bi_size) 1270 return 0; 1271 } 1272 return 1; 1273 } 1274 1275 /* 1276 * Shannon Entropy calculation 1277 * 1278 * Pure byte distribution analysis fails to determine compressibility of data. 1279 * Try calculating entropy to estimate the average minimum number of bits 1280 * needed to encode the sampled data. 1281 * 1282 * For convenience, return the percentage of needed bits, instead of amount of 1283 * bits directly. 1284 * 1285 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy 1286 * and can be compressible with high probability 1287 * 1288 * @ENTROPY_LVL_HIGH - data are not compressible with high probability 1289 * 1290 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. 1291 */ 1292 #define ENTROPY_LVL_ACEPTABLE (65) 1293 #define ENTROPY_LVL_HIGH (80) 1294 1295 /* 1296 * For increased precision in shannon_entropy calculation, 1297 * let's do pow(n, M) to save more digits after comma: 1298 * 1299 * - maximum int bit length is 64 1300 * - ilog2(MAX_SAMPLE_SIZE) -> 13 1301 * - 13 * 4 = 52 < 64 -> M = 4 1302 * 1303 * So use pow(n, 4). 1304 */ 1305 static inline u32 ilog2_w(u64 n) 1306 { 1307 return ilog2(n * n * n * n); 1308 } 1309 1310 static u32 shannon_entropy(struct heuristic_ws *ws) 1311 { 1312 const u32 entropy_max = 8 * ilog2_w(2); 1313 u32 entropy_sum = 0; 1314 u32 p, p_base, sz_base; 1315 u32 i; 1316 1317 sz_base = ilog2_w(ws->sample_size); 1318 for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { 1319 p = ws->bucket[i].count; 1320 p_base = ilog2_w(p); 1321 entropy_sum += p * (sz_base - p_base); 1322 } 1323 1324 entropy_sum /= ws->sample_size; 1325 return entropy_sum * 100 / entropy_max; 1326 } 1327 1328 #define RADIX_BASE 4U 1329 #define COUNTERS_SIZE (1U << RADIX_BASE) 1330 1331 static u8 get4bits(u64 num, int shift) { 1332 u8 low4bits; 1333 1334 num >>= shift; 1335 /* Reverse order */ 1336 low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); 1337 return low4bits; 1338 } 1339 1340 /* 1341 * Use 4 bits as radix base 1342 * Use 16 u32 counters for calculating new position in buf array 1343 * 1344 * @array - array that will be sorted 1345 * @array_buf - buffer array to store sorting results 1346 * must be equal in size to @array 1347 * @num - array size 1348 */ 1349 static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, 1350 int num) 1351 { 1352 u64 max_num; 1353 u64 buf_num; 1354 u32 counters[COUNTERS_SIZE]; 1355 u32 new_addr; 1356 u32 addr; 1357 int bitlen; 1358 int shift; 1359 int i; 1360 1361 /* 1362 * Try avoid useless loop iterations for small numbers stored in big 1363 * counters. Example: 48 33 4 ... in 64bit array 1364 */ 1365 max_num = array[0].count; 1366 for (i = 1; i < num; i++) { 1367 buf_num = array[i].count; 1368 if (buf_num > max_num) 1369 max_num = buf_num; 1370 } 1371 1372 buf_num = ilog2(max_num); 1373 bitlen = ALIGN(buf_num, RADIX_BASE * 2); 1374 1375 shift = 0; 1376 while (shift < bitlen) { 1377 memset(counters, 0, sizeof(counters)); 1378 1379 for (i = 0; i < num; i++) { 1380 buf_num = array[i].count; 1381 addr = get4bits(buf_num, shift); 1382 counters[addr]++; 1383 } 1384 1385 for (i = 1; i < COUNTERS_SIZE; i++) 1386 counters[i] += counters[i - 1]; 1387 1388 for (i = num - 1; i >= 0; i--) { 1389 buf_num = array[i].count; 1390 addr = get4bits(buf_num, shift); 1391 counters[addr]--; 1392 new_addr = counters[addr]; 1393 array_buf[new_addr] = array[i]; 1394 } 1395 1396 shift += RADIX_BASE; 1397 1398 /* 1399 * Normal radix expects to move data from a temporary array, to 1400 * the main one. But that requires some CPU time. Avoid that 1401 * by doing another sort iteration to original array instead of 1402 * memcpy() 1403 */ 1404 memset(counters, 0, sizeof(counters)); 1405 1406 for (i = 0; i < num; i ++) { 1407 buf_num = array_buf[i].count; 1408 addr = get4bits(buf_num, shift); 1409 counters[addr]++; 1410 } 1411 1412 for (i = 1; i < COUNTERS_SIZE; i++) 1413 counters[i] += counters[i - 1]; 1414 1415 for (i = num - 1; i >= 0; i--) { 1416 buf_num = array_buf[i].count; 1417 addr = get4bits(buf_num, shift); 1418 counters[addr]--; 1419 new_addr = counters[addr]; 1420 array[new_addr] = array_buf[i]; 1421 } 1422 1423 shift += RADIX_BASE; 1424 } 1425 } 1426 1427 /* 1428 * Size of the core byte set - how many bytes cover 90% of the sample 1429 * 1430 * There are several types of structured binary data that use nearly all byte 1431 * values. The distribution can be uniform and counts in all buckets will be 1432 * nearly the same (eg. encrypted data). Unlikely to be compressible. 1433 * 1434 * Other possibility is normal (Gaussian) distribution, where the data could 1435 * be potentially compressible, but we have to take a few more steps to decide 1436 * how much. 1437 * 1438 * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, 1439 * compression algo can easy fix that 1440 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high 1441 * probability is not compressible 1442 */ 1443 #define BYTE_CORE_SET_LOW (64) 1444 #define BYTE_CORE_SET_HIGH (200) 1445 1446 static int byte_core_set_size(struct heuristic_ws *ws) 1447 { 1448 u32 i; 1449 u32 coreset_sum = 0; 1450 const u32 core_set_threshold = ws->sample_size * 90 / 100; 1451 struct bucket_item *bucket = ws->bucket; 1452 1453 /* Sort in reverse order */ 1454 radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); 1455 1456 for (i = 0; i < BYTE_CORE_SET_LOW; i++) 1457 coreset_sum += bucket[i].count; 1458 1459 if (coreset_sum > core_set_threshold) 1460 return i; 1461 1462 for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { 1463 coreset_sum += bucket[i].count; 1464 if (coreset_sum > core_set_threshold) 1465 break; 1466 } 1467 1468 return i; 1469 } 1470 1471 /* 1472 * Count byte values in buckets. 1473 * This heuristic can detect textual data (configs, xml, json, html, etc). 1474 * Because in most text-like data byte set is restricted to limited number of 1475 * possible characters, and that restriction in most cases makes data easy to 1476 * compress. 1477 * 1478 * @BYTE_SET_THRESHOLD - consider all data within this byte set size: 1479 * less - compressible 1480 * more - need additional analysis 1481 */ 1482 #define BYTE_SET_THRESHOLD (64) 1483 1484 static u32 byte_set_size(const struct heuristic_ws *ws) 1485 { 1486 u32 i; 1487 u32 byte_set_size = 0; 1488 1489 for (i = 0; i < BYTE_SET_THRESHOLD; i++) { 1490 if (ws->bucket[i].count > 0) 1491 byte_set_size++; 1492 } 1493 1494 /* 1495 * Continue collecting count of byte values in buckets. If the byte 1496 * set size is bigger then the threshold, it's pointless to continue, 1497 * the detection technique would fail for this type of data. 1498 */ 1499 for (; i < BUCKET_SIZE; i++) { 1500 if (ws->bucket[i].count > 0) { 1501 byte_set_size++; 1502 if (byte_set_size > BYTE_SET_THRESHOLD) 1503 return byte_set_size; 1504 } 1505 } 1506 1507 return byte_set_size; 1508 } 1509 1510 static bool sample_repeated_patterns(struct heuristic_ws *ws) 1511 { 1512 const u32 half_of_sample = ws->sample_size / 2; 1513 const u8 *data = ws->sample; 1514 1515 return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; 1516 } 1517 1518 static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, 1519 struct heuristic_ws *ws) 1520 { 1521 struct page *page; 1522 pgoff_t index, index_end; 1523 u32 i, curr_sample_pos; 1524 u8 *in_data; 1525 1526 /* 1527 * Compression handles the input data by chunks of 128KiB 1528 * (defined by BTRFS_MAX_UNCOMPRESSED) 1529 * 1530 * We do the same for the heuristic and loop over the whole range. 1531 * 1532 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will 1533 * process no more than BTRFS_MAX_UNCOMPRESSED at a time. 1534 */ 1535 if (end - start > BTRFS_MAX_UNCOMPRESSED) 1536 end = start + BTRFS_MAX_UNCOMPRESSED; 1537 1538 index = start >> PAGE_SHIFT; 1539 index_end = end >> PAGE_SHIFT; 1540 1541 /* Don't miss unaligned end */ 1542 if (!PAGE_ALIGNED(end)) 1543 index_end++; 1544 1545 curr_sample_pos = 0; 1546 while (index < index_end) { 1547 page = find_get_page(inode->i_mapping, index); 1548 in_data = kmap_local_page(page); 1549 /* Handle case where the start is not aligned to PAGE_SIZE */ 1550 i = start % PAGE_SIZE; 1551 while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { 1552 /* Don't sample any garbage from the last page */ 1553 if (start > end - SAMPLING_READ_SIZE) 1554 break; 1555 memcpy(&ws->sample[curr_sample_pos], &in_data[i], 1556 SAMPLING_READ_SIZE); 1557 i += SAMPLING_INTERVAL; 1558 start += SAMPLING_INTERVAL; 1559 curr_sample_pos += SAMPLING_READ_SIZE; 1560 } 1561 kunmap_local(in_data); 1562 put_page(page); 1563 1564 index++; 1565 } 1566 1567 ws->sample_size = curr_sample_pos; 1568 } 1569 1570 /* 1571 * Compression heuristic. 1572 * 1573 * The following types of analysis can be performed: 1574 * - detect mostly zero data 1575 * - detect data with low "byte set" size (text, etc) 1576 * - detect data with low/high "core byte" set 1577 * 1578 * Return non-zero if the compression should be done, 0 otherwise. 1579 */ 1580 int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) 1581 { 1582 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1583 struct list_head *ws_list = get_workspace(fs_info, 0, 0); 1584 struct heuristic_ws *ws; 1585 u32 i; 1586 u8 byte; 1587 int ret = 0; 1588 1589 ws = list_entry(ws_list, struct heuristic_ws, list); 1590 1591 heuristic_collect_sample(&inode->vfs_inode, start, end, ws); 1592 1593 if (sample_repeated_patterns(ws)) { 1594 ret = 1; 1595 goto out; 1596 } 1597 1598 memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); 1599 1600 for (i = 0; i < ws->sample_size; i++) { 1601 byte = ws->sample[i]; 1602 ws->bucket[byte].count++; 1603 } 1604 1605 i = byte_set_size(ws); 1606 if (i < BYTE_SET_THRESHOLD) { 1607 ret = 2; 1608 goto out; 1609 } 1610 1611 i = byte_core_set_size(ws); 1612 if (i <= BYTE_CORE_SET_LOW) { 1613 ret = 3; 1614 goto out; 1615 } 1616 1617 if (i >= BYTE_CORE_SET_HIGH) { 1618 ret = 0; 1619 goto out; 1620 } 1621 1622 i = shannon_entropy(ws); 1623 if (i <= ENTROPY_LVL_ACEPTABLE) { 1624 ret = 4; 1625 goto out; 1626 } 1627 1628 /* 1629 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be 1630 * needed to give green light to compression. 1631 * 1632 * For now just assume that compression at that level is not worth the 1633 * resources because: 1634 * 1635 * 1. it is possible to defrag the data later 1636 * 1637 * 2. the data would turn out to be hardly compressible, eg. 150 byte 1638 * values, every bucket has counter at level ~54. The heuristic would 1639 * be confused. This can happen when data have some internal repeated 1640 * patterns like "abbacbbc...". This can be detected by analyzing 1641 * pairs of bytes, which is too costly. 1642 */ 1643 if (i < ENTROPY_LVL_HIGH) { 1644 ret = 5; 1645 goto out; 1646 } else { 1647 ret = 0; 1648 goto out; 1649 } 1650 1651 out: 1652 put_workspace(fs_info, 0, ws_list); 1653 return ret; 1654 } 1655 1656 /* 1657 * Convert the compression suffix (eg. after "zlib" starting with ":") to level. 1658 * 1659 * If the resulting level exceeds the algo's supported levels, it will be clamped. 1660 * 1661 * Return <0 if no valid string can be found. 1662 * Return 0 if everything is fine. 1663 */ 1664 int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) 1665 { 1666 int level = 0; 1667 int ret; 1668 1669 if (!type) { 1670 *level_ret = btrfs_compress_set_level(type, level); 1671 return 0; 1672 } 1673 1674 if (str[0] == ':') { 1675 ret = kstrtoint(str + 1, 10, &level); 1676 if (ret) 1677 return ret; 1678 } 1679 1680 *level_ret = btrfs_compress_set_level(type, level); 1681 return 0; 1682 } 1683