// SPDX-License-Identifier: BSD-3-Clause
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t zstd_stat_alloc_fail;
	kstat_named_t zstd_stat_alloc_fallback;
	kstat_named_t zstd_stat_com_alloc_fail;
	kstat_named_t zstd_stat_dec_alloc_fail;
	kstat_named_t zstd_stat_com_inval;
	kstat_named_t zstd_stat_dec_inval;
	kstat_named_t zstd_stat_dec_header_inval;
	kstat_named_t zstd_stat_com_fail;
	kstat_named_t zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t zstd_stat_lz4pass_allowed;
	kstat_named_t zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t zstd_stat_zstdpass_allowed;
	kstat_named_t zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t zstd_stat_passignored;
	kstat_named_t zstd_stat_passignored_size;
	kstat_named_t zstd_stat_buffers;
	kstat_named_t zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail", KSTAT_DATA_UINT64 },
	{ "alloc_fallback", KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "compress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid", KSTAT_DATA_UINT64 },
	{ "compress_failed", KSTAT_DATA_UINT64 },
	{ "decompress_failed", KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed", KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected", KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed", KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected", KSTAT_DATA_UINT64 },
	{ "passignored", KSTAT_DATA_UINT64 },
	{ "passignored_size", KSTAT_DATA_UINT64 },
	{ "buffers", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
};

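/*
 * Reset the resettable counters when the kstat is written to (for example,
 * by writing to the zstd kstat node). The "buffers" and "size" entries are
 * left untouched since they track the live state of the memory pools.
 */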
#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enum describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The ZSTD handlers were split up to keep the implementation as simple as
 * possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

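/*
 * Note: zstd_enum_to_level() below indexes zstd_levels[] directly (the 19
 * positive levels first, then the fast levels in enum order), so the table
 * above must stay in sync with the zio_zstd_levels enum.
 */
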
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these functions if ADDRESS_SANITIZER is
 * defined. ASAN provides them, but KASAN ends up with ADDRESS_SANITIZER
 * defined without providing them, so we add empty stubs here to avoid
 * changing the external code.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif

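/*
 * Walk the given pool and free the buffer of every unlocked slot whose
 * timeout has expired. Slots that are currently in use (locked) are skipped.
 */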
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached, preallocated buffer from the memory pool or allocate
 * a new one if necessary. If an object is older than 2 minutes and does not
 * fit the requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high-frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Scan for a preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size; if so, take it
			 * and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate a new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}

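/*
 * Early-abort wrapper around zfs_zstd_compress_impl(). For large enough
 * blocks at levels at or above the cutoff, cheaper LZ4 and zstd-1 passes are
 * tried first to decide whether the expensive compression attempt is likely
 * to pay off; see the comment inside for details.
 */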
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf
	 *   win with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
#endif

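/*
 * On-disk layout produced above: a 32-bit big-endian compressed length,
 * followed by a 32-bit big-endian word packing the zstd library version
 * (24 bits) and the compression level (8 bits), followed by the magicless
 * zstd frame itself (see zfs_zstdhdr_t in sys/zstd/zstd.h).
 */
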
/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#endif

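/*
 * Generate the ABD-aware zfs_zstd_compress(), zfs_zstd_decompress() and
 * zfs_zstd_decompress_level() entry points that wrap the *_buf functions
 * above (see the ZFS_*_WRAP_DECL macros in sys/zio_compress.h).
 */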
#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other threads need to wait here until decompression is
		 * completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

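/*
 * Both allocators above prepend a struct zstd_kmem bookkeeping header and
 * hand zstd the address just past it. zstd_free() steps back over that
 * header to determine how the memory was obtained and how to return it.
 */
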
/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{

	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reaping the pools seeks out old, unused objects and releases them.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

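/*
 * Scale the pool count to the number of CPUs, set up the memory pools and
 * the decompression fallback area, and register the kstat counters.
 */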
extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif