/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The ZSTD handlers were split up to keep the implementation as simple as
 * possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER	1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if the unused object is older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * If other pooled objects are detected without being used for 2 minutes, they
 * will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size; if so, take it
			 * and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}
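
/*
 * Illustrative sketch of how the pool pieces above fit together (a sketch
 * only; the real callers are zstd_alloc(), zstd_dctx_alloc() and zstd_free()
 * further down, this is not additional API):
 *
 *	struct zstd_kmem *z;
 *
 *	z = zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
 *	// the slot's barrier mutex is now held by this thread
 *	// ... ZSTD uses the buffer that follows the struct zstd_kmem header
 *	zstd_mempool_free(z);
 *	// barrier dropped; the memory stays cached in the slot until
 *	// zstd_mempool_reap() finds it idle past ZSTD_POOL_TIMEOUT
 */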

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
static size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}

/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way, where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
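
/*
 * Worked example of the header encoding above (illustrative only; the
 * accessors and the zfs_zstdhdr_t layout live in sys/zstd/zstd.h): for
 * zstd 1.4.5, ZSTD_VERSION_NUMBER is 1*10000 + 4*100 + 5 = 10405, and the
 * 24-bit field tops out at 0xFFFFFF = 16777215, i.e. version 1677.72.15 as
 * noted above. A reader undoes the byte swap first, then uses the accessors:
 *
 *	zfs_zstdhdr_t copy;
 *	copy.raw_version_level = BE_32(hdr->raw_version_level);
 *	uint8_t level = zfs_get_hdrlevel(&copy);	// ZFS level enum
 *	uint32_t version = copy.version;		// e.g. 10405
 *
 * which is what zfs_zstd_decompress_level_buf() does further down.
 */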

static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));

}
#endif

/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#endif

#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other threads need to wait here until decompression is
		 * completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* release unused memory from pool */

void
zfs_zstd_cache_reap_now(void)
{

	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reaping both pools seeks out and releases old, unused objects.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
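
/*
 * Usage note (illustrative; the exact interfaces depend on the platform
 * glue): the counters in zstd_stats_t are exported through the "zstd" kstat
 * created in zstd_init(), e.g.
 *
 *	# Linux
 *	cat /proc/spl/kstat/zfs/zstd
 *	# FreeBSD
 *	sysctl kstat.zfs.misc.zstd
 *
 * and a kstat write, where the platform supports it, zeroes the counters via
 * kstat_zstd_update(). The ZFS_MODULE_PARAM() lines above expose
 * zstd_earlyabort_pass and zstd_abort_size as run-time tunables for the
 * early-abort heuristic.
 */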