/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
40 */ 41 42 #include <sys/param.h> 43 #include <sys/sysmacros.h> 44 #include <sys/zfs_context.h> 45 #include <sys/zio_compress.h> 46 #include <sys/spa.h> 47 #include <sys/zstd/zstd.h> 48 49 #define ZSTD_STATIC_LINKING_ONLY 50 #include "lib/zstd.h" 51 #include "lib/common/zstd_errors.h" 52 53 #ifndef IN_LIBSA 54 static uint_t zstd_earlyabort_pass = 1; 55 static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3; 56 static unsigned int zstd_abort_size = (128 * 1024); 57 #endif 58 59 static kstat_t *zstd_ksp = NULL; 60 61 typedef struct zstd_stats { 62 kstat_named_t zstd_stat_alloc_fail; 63 kstat_named_t zstd_stat_alloc_fallback; 64 kstat_named_t zstd_stat_com_alloc_fail; 65 kstat_named_t zstd_stat_dec_alloc_fail; 66 kstat_named_t zstd_stat_com_inval; 67 kstat_named_t zstd_stat_dec_inval; 68 kstat_named_t zstd_stat_dec_header_inval; 69 kstat_named_t zstd_stat_com_fail; 70 kstat_named_t zstd_stat_dec_fail; 71 /* 72 * LZ4 first-pass early abort verdict 73 */ 74 kstat_named_t zstd_stat_lz4pass_allowed; 75 kstat_named_t zstd_stat_lz4pass_rejected; 76 /* 77 * zstd-1 second-pass early abort verdict 78 */ 79 kstat_named_t zstd_stat_zstdpass_allowed; 80 kstat_named_t zstd_stat_zstdpass_rejected; 81 /* 82 * We excluded this from early abort for some reason 83 */ 84 kstat_named_t zstd_stat_passignored; 85 kstat_named_t zstd_stat_passignored_size; 86 kstat_named_t zstd_stat_buffers; 87 kstat_named_t zstd_stat_size; 88 } zstd_stats_t; 89 90 static zstd_stats_t zstd_stats = { 91 { "alloc_fail", KSTAT_DATA_UINT64 }, 92 { "alloc_fallback", KSTAT_DATA_UINT64 }, 93 { "compress_alloc_fail", KSTAT_DATA_UINT64 }, 94 { "decompress_alloc_fail", KSTAT_DATA_UINT64 }, 95 { "compress_level_invalid", KSTAT_DATA_UINT64 }, 96 { "decompress_level_invalid", KSTAT_DATA_UINT64 }, 97 { "decompress_header_invalid", KSTAT_DATA_UINT64 }, 98 { "compress_failed", KSTAT_DATA_UINT64 }, 99 { "decompress_failed", KSTAT_DATA_UINT64 }, 100 { "lz4pass_allowed", KSTAT_DATA_UINT64 }, 101 { "lz4pass_rejected", 
KSTAT_DATA_UINT64 }, 102 { "zstdpass_allowed", KSTAT_DATA_UINT64 }, 103 { "zstdpass_rejected", KSTAT_DATA_UINT64 }, 104 { "passignored", KSTAT_DATA_UINT64 }, 105 { "passignored_size", KSTAT_DATA_UINT64 }, 106 { "buffers", KSTAT_DATA_UINT64 }, 107 { "size", KSTAT_DATA_UINT64 }, 108 }; 109 110 #ifdef _KERNEL 111 static int 112 kstat_zstd_update(kstat_t *ksp, int rw) 113 { 114 ASSERT(ksp != NULL); 115 116 if (rw == KSTAT_WRITE && ksp == zstd_ksp) { 117 ZSTDSTAT_ZERO(zstd_stat_alloc_fail); 118 ZSTDSTAT_ZERO(zstd_stat_alloc_fallback); 119 ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail); 120 ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail); 121 ZSTDSTAT_ZERO(zstd_stat_com_inval); 122 ZSTDSTAT_ZERO(zstd_stat_dec_inval); 123 ZSTDSTAT_ZERO(zstd_stat_dec_header_inval); 124 ZSTDSTAT_ZERO(zstd_stat_com_fail); 125 ZSTDSTAT_ZERO(zstd_stat_dec_fail); 126 ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed); 127 ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected); 128 ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed); 129 ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected); 130 ZSTDSTAT_ZERO(zstd_stat_passignored); 131 ZSTDSTAT_ZERO(zstd_stat_passignored_size); 132 } 133 134 return (0); 135 } 136 #endif 137 138 /* Enums describing the allocator type specified by kmem_type in zstd_kmem */ 139 enum zstd_kmem_type { 140 ZSTD_KMEM_UNKNOWN = 0, 141 /* Allocation type using kmem_vmalloc */ 142 ZSTD_KMEM_DEFAULT, 143 /* Pool based allocation using mempool_alloc */ 144 ZSTD_KMEM_POOL, 145 /* Reserved fallback memory for decompression only */ 146 ZSTD_KMEM_DCTX, 147 ZSTD_KMEM_COUNT, 148 }; 149 150 /* Structure for pooled memory objects */ 151 struct zstd_pool { 152 void *mem; 153 size_t size; 154 kmutex_t barrier; 155 hrtime_t timeout; 156 }; 157 158 /* Global structure for handling memory allocations */ 159 struct zstd_kmem { 160 enum zstd_kmem_type kmem_type; 161 size_t kmem_size; 162 struct zstd_pool *pool; 163 }; 164 165 /* Fallback memory structure used for decompression only if memory runs out */ 166 struct zstd_fallback_mem { 167 size_t 
mem_size; 168 void *mem; 169 kmutex_t barrier; 170 }; 171 172 struct zstd_levelmap { 173 int16_t zstd_level; 174 enum zio_zstd_levels level; 175 }; 176 177 /* 178 * ZSTD memory handlers 179 * 180 * For decompression we use a different handler which also provides fallback 181 * memory allocation in case memory runs out. 182 * 183 * The ZSTD handlers were split up for the most simplified implementation. 184 */ 185 #ifndef IN_LIBSA 186 static void *zstd_alloc(void *opaque, size_t size); 187 #endif 188 static void *zstd_dctx_alloc(void *opaque, size_t size); 189 static void zstd_free(void *opaque, void *ptr); 190 191 #ifndef IN_LIBSA 192 /* Compression memory handler */ 193 static const ZSTD_customMem zstd_malloc = { 194 zstd_alloc, 195 zstd_free, 196 NULL, 197 }; 198 #endif 199 200 /* Decompression memory handler */ 201 static const ZSTD_customMem zstd_dctx_malloc = { 202 zstd_dctx_alloc, 203 zstd_free, 204 NULL, 205 }; 206 207 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */ 208 static struct zstd_levelmap zstd_levels[] = { 209 {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1}, 210 {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2}, 211 {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3}, 212 {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4}, 213 {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5}, 214 {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6}, 215 {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7}, 216 {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8}, 217 {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9}, 218 {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10}, 219 {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11}, 220 {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12}, 221 {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13}, 222 {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14}, 223 {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15}, 224 {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16}, 225 {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17}, 226 {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18}, 227 {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19}, 228 {-1, ZIO_ZSTD_LEVEL_FAST_1}, 229 {-2, ZIO_ZSTD_LEVEL_FAST_2}, 230 {-3, ZIO_ZSTD_LEVEL_FAST_3}, 231 {-4, 
ZIO_ZSTD_LEVEL_FAST_4}, 232 {-5, ZIO_ZSTD_LEVEL_FAST_5}, 233 {-6, ZIO_ZSTD_LEVEL_FAST_6}, 234 {-7, ZIO_ZSTD_LEVEL_FAST_7}, 235 {-8, ZIO_ZSTD_LEVEL_FAST_8}, 236 {-9, ZIO_ZSTD_LEVEL_FAST_9}, 237 {-10, ZIO_ZSTD_LEVEL_FAST_10}, 238 {-20, ZIO_ZSTD_LEVEL_FAST_20}, 239 {-30, ZIO_ZSTD_LEVEL_FAST_30}, 240 {-40, ZIO_ZSTD_LEVEL_FAST_40}, 241 {-50, ZIO_ZSTD_LEVEL_FAST_50}, 242 {-60, ZIO_ZSTD_LEVEL_FAST_60}, 243 {-70, ZIO_ZSTD_LEVEL_FAST_70}, 244 {-80, ZIO_ZSTD_LEVEL_FAST_80}, 245 {-90, ZIO_ZSTD_LEVEL_FAST_90}, 246 {-100, ZIO_ZSTD_LEVEL_FAST_100}, 247 {-500, ZIO_ZSTD_LEVEL_FAST_500}, 248 {-1000, ZIO_ZSTD_LEVEL_FAST_1000}, 249 }; 250 251 /* 252 * This variable represents the maximum count of the pool based on the number 253 * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd. 254 */ 255 static int pool_count = 16; 256 257 #define ZSTD_POOL_MAX pool_count 258 #define ZSTD_POOL_TIMEOUT 60 * 2 259 260 static struct zstd_fallback_mem zstd_dctx_fallback; 261 static struct zstd_pool *zstd_mempool_cctx; 262 static struct zstd_pool *zstd_mempool_dctx; 263 264 /* 265 * The library zstd code expects these if ADDRESS_SANITIZER gets defined, 266 * and while ASAN does this, KASAN defines that and does not. So to avoid 267 * changing the external code, we do this. 
268 */ 269 #if defined(ZFS_ASAN_ENABLED) 270 #define ADDRESS_SANITIZER 1 271 #endif 272 #if defined(_KERNEL) && defined(ADDRESS_SANITIZER) 273 void __asan_unpoison_memory_region(void const volatile *addr, size_t size); 274 void __asan_poison_memory_region(void const volatile *addr, size_t size); 275 void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}; 276 void __asan_poison_memory_region(void const volatile *addr, size_t size) {}; 277 #endif 278 279 280 static void 281 zstd_mempool_reap(struct zstd_pool *zstd_mempool) 282 { 283 struct zstd_pool *pool; 284 285 if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) { 286 return; 287 } 288 289 /* free obsolete slots */ 290 for (int i = 0; i < ZSTD_POOL_MAX; i++) { 291 pool = &zstd_mempool[i]; 292 if (pool->mem && mutex_tryenter(&pool->barrier)) { 293 /* Free memory if unused object older than 2 minutes */ 294 if (pool->mem && gethrestime_sec() > pool->timeout) { 295 vmem_free(pool->mem, pool->size); 296 ZSTDSTAT_SUB(zstd_stat_buffers, 1); 297 ZSTDSTAT_SUB(zstd_stat_size, pool->size); 298 pool->mem = NULL; 299 pool->size = 0; 300 pool->timeout = 0; 301 } 302 mutex_exit(&pool->barrier); 303 } 304 } 305 } 306 307 /* 308 * Try to get a cached allocated buffer from memory pool or allocate a new one 309 * if necessary. If a object is older than 2 minutes and does not fit the 310 * requested size, it will be released and a new cached entry will be allocated. 311 * If other pooled objects are detected without being used for 2 minutes, they 312 * will be released, too. 313 * 314 * The concept is that high frequency memory allocations of bigger objects are 315 * expensive. So if a lot of work is going on, allocations will be kept for a 316 * while and can be reused in that time frame. 317 * 318 * The scheduled release will be updated every time a object is reused. 
319 */ 320 321 static void * 322 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) 323 { 324 struct zstd_pool *pool; 325 struct zstd_kmem *mem = NULL; 326 327 if (!zstd_mempool) { 328 return (NULL); 329 } 330 331 /* Seek for preallocated memory slot and free obsolete slots */ 332 for (int i = 0; i < ZSTD_POOL_MAX; i++) { 333 pool = &zstd_mempool[i]; 334 /* 335 * This lock is simply a marker for a pool object being in use. 336 * If it's already hold, it will be skipped. 337 * 338 * We need to create it before checking it to avoid race 339 * conditions caused by running in a threaded context. 340 * 341 * The lock is later released by zstd_mempool_free. 342 */ 343 if (mutex_tryenter(&pool->barrier)) { 344 /* 345 * Check if objects fits the size, if so we take it and 346 * update the timestamp. 347 */ 348 if (pool->mem && size <= pool->size) { 349 pool->timeout = gethrestime_sec() + 350 ZSTD_POOL_TIMEOUT; 351 mem = pool->mem; 352 return (mem); 353 } 354 mutex_exit(&pool->barrier); 355 } 356 } 357 358 /* 359 * If no preallocated slot was found, try to fill in a new one. 360 * 361 * We run a similar algorithm twice here to avoid pool fragmentation. 362 * The first one may generate holes in the list if objects get released. 363 * We always make sure that these holes get filled instead of adding new 364 * allocations constantly at the end. 
365 */ 366 for (int i = 0; i < ZSTD_POOL_MAX; i++) { 367 pool = &zstd_mempool[i]; 368 if (mutex_tryenter(&pool->barrier)) { 369 /* Object is free, try to allocate new one */ 370 if (!pool->mem) { 371 mem = vmem_alloc(size, KM_SLEEP); 372 if (mem) { 373 ZSTDSTAT_ADD(zstd_stat_buffers, 1); 374 ZSTDSTAT_ADD(zstd_stat_size, size); 375 pool->mem = mem; 376 pool->size = size; 377 /* Keep track for later release */ 378 mem->pool = pool; 379 mem->kmem_type = ZSTD_KMEM_POOL; 380 mem->kmem_size = size; 381 } 382 } 383 384 if (size <= pool->size) { 385 /* Update timestamp */ 386 pool->timeout = gethrestime_sec() + 387 ZSTD_POOL_TIMEOUT; 388 389 return (pool->mem); 390 } 391 392 mutex_exit(&pool->barrier); 393 } 394 } 395 396 /* 397 * If the pool is full or the allocation failed, try lazy allocation 398 * instead. 399 */ 400 if (!mem) { 401 mem = vmem_alloc(size, KM_NOSLEEP); 402 if (mem) { 403 mem->pool = NULL; 404 mem->kmem_type = ZSTD_KMEM_DEFAULT; 405 mem->kmem_size = size; 406 } 407 } 408 409 return (mem); 410 } 411 412 /* Mark object as released by releasing the barrier mutex */ 413 static void 414 zstd_mempool_free(struct zstd_kmem *z) 415 { 416 mutex_exit(&z->pool->barrier); 417 } 418 419 /* Convert ZFS internal enum to ZSTD level */ 420 static int 421 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) 422 { 423 if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) { 424 *zstd_level = zstd_levels[level - 1].zstd_level; 425 return (0); 426 } 427 if (level >= ZIO_ZSTD_LEVEL_FAST_1 && 428 level <= ZIO_ZSTD_LEVEL_FAST_1000) { 429 *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1 430 + ZIO_ZSTD_LEVEL_19].zstd_level; 431 return (0); 432 } 433 434 /* Invalid/unknown zfs compression enum - this should never happen. 
*/ 435 return (1); 436 } 437 438 #ifndef IN_LIBSA 439 size_t 440 zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, 441 int level) 442 { 443 int16_t zstd_level; 444 if (zstd_enum_to_level(level, &zstd_level)) { 445 ZSTDSTAT_BUMP(zstd_stat_com_inval); 446 return (s_len); 447 } 448 /* 449 * A zstd early abort heuristic. 450 * 451 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently 452 * 128k), don't try any of this, just go. 453 * (because experimentally that was a reasonable cutoff for a perf win 454 * with tiny ratio change) 455 * - First, we try LZ4 compression, and if it doesn't early abort, we 456 * jump directly to whatever compression level we intended to try. 457 * - Second, we try zstd-1 - if that errors out (usually, but not 458 * exclusively, if it would overflow), we give up early. 459 * 460 * If it works, instead we go on and compress anyway. 461 * 462 * Why two passes? LZ4 alone gets you a lot of the way, but on highly 463 * compressible data, it was losing up to 8.5% of the compressed 464 * savings versus no early abort, and all the zstd-fast levels are 465 * worse indications on their own than LZ4, and don't improve the LZ4 466 * pass noticably if stacked like this. 
467 */ 468 size_t actual_abort_size = zstd_abort_size; 469 if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && 470 s_len >= actual_abort_size) { 471 int pass_len = 1; 472 pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0); 473 if (pass_len < d_len) { 474 ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); 475 goto keep_trying; 476 } 477 ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); 478 479 pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, 480 ZIO_ZSTD_LEVEL_1); 481 if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { 482 ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); 483 return (s_len); 484 } 485 ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); 486 } else { 487 ZSTDSTAT_BUMP(zstd_stat_passignored); 488 if (s_len < actual_abort_size) { 489 ZSTDSTAT_BUMP(zstd_stat_passignored_size); 490 } 491 } 492 keep_trying: 493 return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); 494 495 } 496 497 /* Compress block using zstd */ 498 size_t 499 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, 500 int level) 501 { 502 size_t c_len; 503 int16_t zstd_level; 504 zfs_zstdhdr_t *hdr; 505 ZSTD_CCtx *cctx; 506 507 hdr = (zfs_zstdhdr_t *)d_start; 508 509 /* Skip compression if the specified level is invalid */ 510 if (zstd_enum_to_level(level, &zstd_level)) { 511 ZSTDSTAT_BUMP(zstd_stat_com_inval); 512 return (s_len); 513 } 514 515 ASSERT3U(d_len, >=, sizeof (*hdr)); 516 ASSERT3U(d_len, <=, s_len); 517 ASSERT3U(zstd_level, !=, 0); 518 519 cctx = ZSTD_createCCtx_advanced(zstd_malloc); 520 521 /* 522 * Out of kernel memory, gently fall through - this will disable 523 * compression in zio_compress_data 524 */ 525 if (!cctx) { 526 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail); 527 return (s_len); 528 } 529 530 /* Set the compression level */ 531 ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level); 532 533 /* Use the "magicless" zstd header which saves us 4 header bytes */ 534 ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, 
ZSTD_f_zstd1_magicless); 535 536 /* 537 * Disable redundant checksum calculation and content size storage since 538 * this is already done by ZFS itself. 539 */ 540 ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0); 541 ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0); 542 543 c_len = ZSTD_compress2(cctx, 544 hdr->data, 545 d_len - sizeof (*hdr), 546 s_start, s_len); 547 548 ZSTD_freeCCtx(cctx); 549 550 /* Error in the compression routine, disable compression. */ 551 if (ZSTD_isError(c_len)) { 552 /* 553 * If we are aborting the compression because the saves are 554 * too small, that is not a failure. Everything else is a 555 * failure, so increment the compression failure counter. 556 */ 557 int err = ZSTD_getErrorCode(c_len); 558 if (err != ZSTD_error_dstSize_tooSmall) { 559 ZSTDSTAT_BUMP(zstd_stat_com_fail); 560 dprintf("Error: %s", ZSTD_getErrorString(err)); 561 } 562 return (s_len); 563 } 564 565 /* 566 * Encode the compressed buffer size at the start. We'll need this in 567 * decompression to counter the effects of padding which might be added 568 * to the compressed buffer and which, if unhandled, would confuse the 569 * hell out of our decompression function. 570 */ 571 hdr->c_len = BE_32(c_len); 572 573 /* 574 * Check version for overflow. 575 * The limit of 24 bits must not be exceeded. This allows a maximum 576 * version 1677.72.15 which we don't expect to be ever reached. 577 */ 578 ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF); 579 580 /* 581 * Encode the compression level as well. We may need to know the 582 * original compression level if compressed_arc is disabled, to match 583 * the compression settings to write this block to the L2ARC. 584 * 585 * Encode the actual level, so if the enum changes in the future, we 586 * will be compatible. 
587 * 588 * The upper 24 bits store the ZSTD version to be able to provide 589 * future compatibility, since new versions might enhance the 590 * compression algorithm in a way, where the compressed data will 591 * change. 592 * 593 * As soon as such incompatibility occurs, handling code needs to be 594 * added, differentiating between the versions. 595 */ 596 zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER); 597 zfs_set_hdrlevel(hdr, level); 598 hdr->raw_version_level = BE_32(hdr->raw_version_level); 599 600 return (c_len + sizeof (*hdr)); 601 } 602 #endif 603 604 /* Decompress block using zstd and return its stored level */ 605 int 606 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, 607 size_t d_len, uint8_t *level) 608 { 609 ZSTD_DCtx *dctx; 610 size_t result; 611 int16_t zstd_level; 612 uint32_t c_len; 613 const zfs_zstdhdr_t *hdr; 614 zfs_zstdhdr_t hdr_copy; 615 616 hdr = (const zfs_zstdhdr_t *)s_start; 617 c_len = BE_32(hdr->c_len); 618 619 /* 620 * Make a copy instead of directly converting the header, since we must 621 * not modify the original data that may be used again later. 622 */ 623 hdr_copy.raw_version_level = BE_32(hdr->raw_version_level); 624 uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy); 625 626 /* 627 * NOTE: We ignore the ZSTD version for now. As soon as any 628 * incompatibility occurs, it has to be handled accordingly. 629 * The version can be accessed via `hdr_copy.version`. 630 */ 631 632 /* 633 * Convert and check the level 634 * An invalid level is a strong indicator for data corruption! In such 635 * case return an error so the upper layers can try to fix it. 
636 */ 637 if (zstd_enum_to_level(curlevel, &zstd_level)) { 638 ZSTDSTAT_BUMP(zstd_stat_dec_inval); 639 return (1); 640 } 641 642 ASSERT3U(d_len, >=, s_len); 643 ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT); 644 645 /* Invalid compressed buffer size encoded at start */ 646 if (c_len + sizeof (*hdr) > s_len) { 647 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval); 648 return (1); 649 } 650 651 dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc); 652 if (!dctx) { 653 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail); 654 return (1); 655 } 656 657 /* Set header type to "magicless" */ 658 ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless); 659 660 /* Decompress the data and release the context */ 661 result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len); 662 ZSTD_freeDCtx(dctx); 663 664 /* 665 * Returns 0 on success (decompression function returned non-negative) 666 * and non-zero on failure (decompression function returned negative. 667 */ 668 if (ZSTD_isError(result)) { 669 ZSTDSTAT_BUMP(zstd_stat_dec_fail); 670 return (1); 671 } 672 673 if (level) { 674 *level = curlevel; 675 } 676 677 return (0); 678 } 679 680 /* Decompress datablock using zstd */ 681 int 682 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, 683 int level __maybe_unused) 684 { 685 686 return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, 687 NULL)); 688 } 689 690 #ifndef IN_LIBSA 691 /* Allocator for zstd compression context using mempool_allocator */ 692 static void * 693 zstd_alloc(void *opaque __maybe_unused, size_t size) 694 { 695 size_t nbytes = sizeof (struct zstd_kmem) + size; 696 struct zstd_kmem *z = NULL; 697 698 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes); 699 700 if (!z) { 701 ZSTDSTAT_BUMP(zstd_stat_alloc_fail); 702 return (NULL); 703 } 704 705 return ((void*)z + (sizeof (struct zstd_kmem))); 706 } 707 #endif 708 709 /* 710 * Allocator for zstd decompression context using mempool_allocator with 711 * 
fallback to reserved memory if allocation fails 712 */ 713 static void * 714 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size) 715 { 716 size_t nbytes = sizeof (struct zstd_kmem) + size; 717 struct zstd_kmem *z = NULL; 718 enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT; 719 720 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes); 721 if (!z) { 722 /* Try harder, decompression shall not fail */ 723 z = vmem_alloc(nbytes, KM_SLEEP); 724 if (z) { 725 z->pool = NULL; 726 } 727 ZSTDSTAT_BUMP(zstd_stat_alloc_fail); 728 } else { 729 return ((void*)z + (sizeof (struct zstd_kmem))); 730 } 731 732 /* Fallback if everything fails */ 733 if (!z) { 734 /* 735 * Barrier since we only can handle it in a single thread. All 736 * other following threads need to wait here until decompression 737 * is completed. zstd_free will release this barrier later. 738 */ 739 mutex_enter(&zstd_dctx_fallback.barrier); 740 741 z = zstd_dctx_fallback.mem; 742 type = ZSTD_KMEM_DCTX; 743 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback); 744 } 745 746 /* Allocation should always be successful */ 747 if (!z) { 748 return (NULL); 749 } 750 751 z->kmem_type = type; 752 z->kmem_size = nbytes; 753 754 return ((void*)z + (sizeof (struct zstd_kmem))); 755 } 756 757 /* Free allocated memory by its specific type */ 758 static void 759 zstd_free(void *opaque __maybe_unused, void *ptr) 760 { 761 struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem)); 762 enum zstd_kmem_type type; 763 764 ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT); 765 ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN); 766 767 type = z->kmem_type; 768 switch (type) { 769 case ZSTD_KMEM_DEFAULT: 770 vmem_free(z, z->kmem_size); 771 break; 772 case ZSTD_KMEM_POOL: 773 zstd_mempool_free(z); 774 break; 775 case ZSTD_KMEM_DCTX: 776 mutex_exit(&zstd_dctx_fallback.barrier); 777 break; 778 default: 779 break; 780 } 781 } 782 783 /* Allocate fallback memory to ensure safe decompression */ 784 static void __init 785 create_fallback_mem(struct 
zstd_fallback_mem *mem, size_t size) 786 { 787 mem->mem_size = size; 788 mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP); 789 mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL); 790 } 791 792 /* Initialize memory pool barrier mutexes */ 793 static void __init 794 zstd_mempool_init(void) 795 { 796 zstd_mempool_cctx = 797 kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); 798 zstd_mempool_dctx = 799 kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); 800 801 for (int i = 0; i < ZSTD_POOL_MAX; i++) { 802 mutex_init(&zstd_mempool_cctx[i].barrier, NULL, 803 MUTEX_DEFAULT, NULL); 804 mutex_init(&zstd_mempool_dctx[i].barrier, NULL, 805 MUTEX_DEFAULT, NULL); 806 } 807 } 808 809 /* Initialize zstd-related memory handling */ 810 static int __init 811 zstd_meminit(void) 812 { 813 zstd_mempool_init(); 814 815 /* 816 * Estimate the size of the fallback decompression context. 817 * The expected size on x64 with current ZSTD should be about 160 KB. 818 */ 819 create_fallback_mem(&zstd_dctx_fallback, 820 P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem), 821 PAGESIZE)); 822 823 return (0); 824 } 825 826 /* Release object from pool and free memory */ 827 static void 828 release_pool(struct zstd_pool *pool) 829 { 830 mutex_destroy(&pool->barrier); 831 vmem_free(pool->mem, pool->size); 832 pool->mem = NULL; 833 pool->size = 0; 834 } 835 836 /* Release memory pool objects */ 837 static void 838 zstd_mempool_deinit(void) 839 { 840 for (int i = 0; i < ZSTD_POOL_MAX; i++) { 841 release_pool(&zstd_mempool_cctx[i]); 842 release_pool(&zstd_mempool_dctx[i]); 843 } 844 845 kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); 846 kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); 847 zstd_mempool_dctx = NULL; 848 zstd_mempool_cctx = NULL; 849 } 850 851 /* release unused memory from pool */ 852 853 void 854 zfs_zstd_cache_reap_now(void) 855 { 856 857 /* 858 * Short-circuit if there are no buffers to begin with. 
859 */ 860 if (ZSTDSTAT(zstd_stat_buffers) == 0) 861 return; 862 863 /* 864 * calling alloc with zero size seeks 865 * and releases old unused objects 866 */ 867 zstd_mempool_reap(zstd_mempool_cctx); 868 zstd_mempool_reap(zstd_mempool_dctx); 869 } 870 871 extern int __init 872 zstd_init(void) 873 { 874 /* Set pool size by using maximum sane thread count * 4 */ 875 pool_count = (boot_ncpus * 4); 876 zstd_meminit(); 877 878 /* Initialize kstat */ 879 zstd_ksp = kstat_create("zfs", 0, "zstd", "misc", 880 KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t), 881 KSTAT_FLAG_VIRTUAL); 882 if (zstd_ksp != NULL) { 883 zstd_ksp->ks_data = &zstd_stats; 884 kstat_install(zstd_ksp); 885 #ifdef _KERNEL 886 zstd_ksp->ks_update = kstat_zstd_update; 887 #endif 888 } 889 890 return (0); 891 } 892 893 extern void 894 zstd_fini(void) 895 { 896 /* Deinitialize kstat */ 897 if (zstd_ksp != NULL) { 898 kstat_delete(zstd_ksp); 899 zstd_ksp = NULL; 900 } 901 902 /* Release fallback memory */ 903 vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size); 904 mutex_destroy(&zstd_dctx_fallback.barrier); 905 906 /* Deinit memory pool */ 907 zstd_mempool_deinit(); 908 } 909 910 #if defined(_KERNEL) 911 #ifdef __FreeBSD__ 912 module_init(zstd_init); 913 module_exit(zstd_fini); 914 #endif 915 916 ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW, 917 "Enable early abort attempts when using zstd"); 918 ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW, 919 "Minimal size of block to attempt early abort"); 920 #endif 921