// SPDX-License-Identifier: BSD-3-Clause
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t zstd_stat_alloc_fail;
	kstat_named_t zstd_stat_alloc_fallback;
	kstat_named_t zstd_stat_com_alloc_fail;
	kstat_named_t zstd_stat_dec_alloc_fail;
	kstat_named_t zstd_stat_com_inval;
	kstat_named_t zstd_stat_dec_inval;
	kstat_named_t zstd_stat_dec_header_inval;
	kstat_named_t zstd_stat_com_fail;
	kstat_named_t zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t zstd_stat_lz4pass_allowed;
	kstat_named_t zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t zstd_stat_zstdpass_allowed;
	kstat_named_t zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t zstd_stat_passignored;
	kstat_named_t zstd_stat_passignored_size;
	kstat_named_t zstd_stat_buffers;
	kstat_named_t zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail", KSTAT_DATA_UINT64 },
	{ "alloc_fallback", KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "compress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid", KSTAT_DATA_UINT64 },
	{ "compress_failed", KSTAT_DATA_UINT64 },
	{ "decompress_failed", KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed", KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected", KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed", KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected", KSTAT_DATA_UINT64 },
	{ "passignored", KSTAT_DATA_UINT64 },
	{ "passignored_size", KSTAT_DATA_UINT64 },
	{ "buffers", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The library zstd code expects these functions if ADDRESS_SANITIZER gets
 * defined; userland ASAN provides them, but KASAN defines ADDRESS_SANITIZER
 * without providing them. So to avoid changing the external code, we supply
 * no-op stubs here.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER	1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


/* Free pooled buffers that have sat unused past their timeout */
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached, allocated buffer from the memory pool, or allocate a
 * new one if necessary. If an object is older than 2 minutes and does not fit
 * the requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

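/*
 * Illustrative timing example (follows directly from ZSTD_POOL_TIMEOUT
 * above): a buffer handed out at time T has its release scheduled for
 * T + 120 seconds, and every reuse pushes that deadline forward. Only
 * buffers that sit idle past the deadline are actually freed, by
 * zstd_mempool_reap().
 */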
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Look for a preallocated memory slot */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, the slot will be skipped.
		 *
		 * We need to acquire it before checking the object to avoid
		 * race conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first pass may generate holes in the list if objects get
	 * released. We always make sure that these holes get filled instead
	 * of constantly adding new allocations at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Slot is empty, try to allocate a new object */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

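/*
 * Worked example of the table lookup done by zstd_enum_to_level() below
 * (illustrative; it assumes ZIO_ZSTD_LEVEL_1..ZIO_ZSTD_LEVEL_19 carry the
 * numeric values 1..19 and that the ZIO_ZSTD_LEVEL_FAST_* enumerators are
 * declared consecutively, which the ordering of zstd_levels[] relies on):
 * ZIO_ZSTD_LEVEL_7 selects index 7 - 1 = 6, i.e. zstd level 7;
 * ZIO_ZSTD_LEVEL_FAST_10 selects index (ZIO_ZSTD_LEVEL_FAST_10 -
 * ZIO_ZSTD_LEVEL_FAST_1) + ZIO_ZSTD_LEVEL_19 = 9 + 19 = 28, i.e.
 * zstd level -10.
 */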
/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15, which we don't expect to ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way that changes the compressed data.
	 *
	 * As soon as such an incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}

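/*
 * Worked example of the header produced above (illustrative; the numbers
 * assume zstd 1.4.5, whose ZSTD_VERSION_NUMBER is 1 * 10000 + 4 * 100 + 5 =
 * 10405): c_len holds the big-endian length of the compressed payload that
 * follows the header, and raw_version_level packs 10405 into the 24-bit
 * version field (hence the 0xFFFFFF == "1677.72.15" ceiling asserted above)
 * together with the 8-bit zfs compression level enum, the whole word being
 * stored big-endian via BE_32().
 */
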
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indicators on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
#endif
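
/*
 * Illustrative behaviour of the early-abort path in zfs_zstd_compress_buf()
 * with the default tunables (zstd_earlyabort_pass = 1, zstd_cutoff_level =
 * ZIO_ZSTD_LEVEL_3, zstd_abort_size = 128 KiB): a 1 MiB record written with
 * zstd-9 first runs the LZ4 probe and, if that probe is rejected, the zstd-1
 * probe before the real zstd-9 pass. A 64 KiB record, or one written with
 * zstd-2, skips the probes and is compressed directly.
 */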

/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Return 0 on success (the decompression function returned a
	 * non-negative value) and non-zero on failure (it returned a
	 * negative value).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#endif

#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

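/*
 * Memory layout used by the allocators below: a struct zstd_kmem
 * bookkeeping header is prepended to every buffer handed to the zstd
 * library.
 *
 *   +-------------------+--------------------------------------+
 *   | struct zstd_kmem  | memory used by zstd (requested size) |
 *   +-------------------+--------------------------------------+
 *   ^ start of allocation  ^ pointer returned to zstd
 *
 * zstd_alloc() and zstd_dctx_alloc() return the address just past the
 * header; zstd_free() steps back by sizeof (struct zstd_kmem) to recover
 * it and dispatches on kmem_type to the matching release path.
 */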
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif

/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until decompression
		 * is completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{

	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reap both pools, releasing buffers that have sat unused past
	 * their timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
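
/*
 * Observability note (illustrative; the exact paths depend on the platform
 * glue): on Linux the kstat registered in zstd_init() is typically exposed
 * as /proc/spl/kstat/zfs/zstd, and the tunables declared above surface as
 * module parameters of the zfs module, e.g.
 *
 *	cat /proc/spl/kstat/zfs/zstd
 *	echo 0 > /sys/module/zfs/parameters/zstd_earlyabort_pass
 *
 * Writing to the kstat zeroes the counters via kstat_zstd_update().
 */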