xref: /freebsd/sys/contrib/openzfs/module/zstd/zfs_zstd.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * 1. Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * 3. Neither the name of the copyright holder nor the names of its
15  * contributors may be used to endorse or promote products derived from this
16  * software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Copyright (c) 2016-2018, Klara Inc.
33  * Copyright (c) 2016-2018, Allan Jude
34  * Copyright (c) 2018-2020, Sebastian Gottschall
35  * Copyright (c) 2019-2020, Michael Niewöhner
36  * Copyright (c) 2020, The FreeBSD Foundation [1]
37  *
38  * [1] Portions of this software were developed by Allan Jude
39  *     under sponsorship from the FreeBSD Foundation.
40  */
41 
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
46 #include <sys/spa.h>
47 #include <sys/zstd/zstd.h>
48 
49 #define	ZSTD_STATIC_LINKING_ONLY
50 #include "lib/zstd.h"
51 #include "lib/common/zstd_errors.h"
52 
#ifndef IN_LIBSA
/*
 * Tunables consulted by the early-abort heuristic in the compression path:
 *  - zstd_earlyabort_pass: the heuristic only runs while this is non-zero.
 *  - zstd_cutoff_level: the heuristic only runs for zstd levels at or above
 *    this value.
 *  - zstd_abort_size: the heuristic only runs for source buffers of at
 *    least this many bytes.
 */
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

#ifdef IN_BASE
/* With IN_BASE the buffer decompression entry point is not static. */
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

/* kstat handle; created in zstd_init() and deleted in zstd_fini(). */
static kstat_t *zstd_ksp = NULL;
64 
/*
 * Named kstat counters for the zstd module. The field order must match the
 * initializer of zstd_stats below.
 */
typedef struct zstd_stats {
	/* A pool allocation attempt returned no memory */
	kstat_named_t	zstd_stat_alloc_fail;
	/* Decompression had to use the reserved fallback buffer */
	kstat_named_t	zstd_stat_alloc_fallback;
	/* Creating a compression context failed */
	kstat_named_t	zstd_stat_com_alloc_fail;
	/* Creating a decompression context failed */
	kstat_named_t	zstd_stat_dec_alloc_fail;
	/* Compression was requested with an unknown level enum */
	kstat_named_t	zstd_stat_com_inval;
	/* The level stored in a block header was unknown */
	kstat_named_t	zstd_stat_dec_inval;
	/* The compressed length stored in a block header was out of range */
	kstat_named_t	zstd_stat_dec_header_inval;
	/* zstd reported a compression error other than "dst too small" */
	kstat_named_t	zstd_stat_com_fail;
	/* zstd reported a decompression error */
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	/* ... and the reason was a source buffer below zstd_abort_size */
	kstat_named_t	zstd_stat_passignored_size;
	/* Number of buffers currently held by the memory pools */
	kstat_named_t	zstd_stat_buffers;
	/* Total bytes currently held by the memory pools */
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;
93 
94 static zstd_stats_t zstd_stats = {
95 	{ "alloc_fail",			KSTAT_DATA_UINT64 },
96 	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
97 	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
98 	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
99 	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
100 	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
101 	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
102 	{ "compress_failed",		KSTAT_DATA_UINT64 },
103 	{ "decompress_failed",		KSTAT_DATA_UINT64 },
104 	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
105 	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
106 	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
107 	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
108 	{ "passignored",		KSTAT_DATA_UINT64 },
109 	{ "passignored_size",		KSTAT_DATA_UINT64 },
110 	{ "buffers",			KSTAT_DATA_UINT64 },
111 	{ "size",			KSTAT_DATA_UINT64 },
112 };
113 
#ifdef _KERNEL
/*
 * kstat update callback. A write to the zstd kstat resets the event
 * counters; the pool gauges (buffers/size) describe live allocations and are
 * deliberately left alone. Reads are a no-op. Always returns 0.
 */
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	/* Only a write aimed at our own kstat resets anything. */
	if (rw != KSTAT_WRITE || ksp != zstd_ksp) {
		return (0);
	}

	/* Allocation and level/header validation counters */
	ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
	ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_com_inval);
	ZSTDSTAT_ZERO(zstd_stat_dec_inval);
	ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);

	/* (De)compression failure counters */
	ZSTDSTAT_ZERO(zstd_stat_com_fail);
	ZSTDSTAT_ZERO(zstd_stat_dec_fail);

	/* Early-abort verdict counters */
	ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
	ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
	ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
	ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
	ZSTDSTAT_ZERO(zstd_stat_passignored);
	ZSTDSTAT_ZERO(zstd_stat_passignored_size);

	return (0);
}
#endif
141 
/*
 * Allocation kinds recorded in zstd_kmem.kmem_type. zstd_free() dispatches
 * on this value to decide how a buffer is given back.
 */
enum zstd_kmem_type {
	/* Never assigned to a live allocation; zstd_free() asserts against it */
	ZSTD_KMEM_UNKNOWN = 0,
	/* Stand-alone vmem_alloc() buffer, released with vmem_free() */
	ZSTD_KMEM_DEFAULT,
	/* Buffer owned by a pool slot; freeing just unlocks the slot */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of entries; used as the upper bound in zstd_free() */
	ZSTD_KMEM_COUNT,
};
153 
/* Structure for pooled memory objects (one slot of a memory pool) */
struct zstd_pool {
	/* Cached buffer, or NULL while the slot is empty */
	void *mem;
	/* Size in bytes of the cached buffer (0 while empty) */
	size_t size;
	/* Held for as long as the slot's buffer is handed out to a caller */
	kmutex_t barrier;
	/* Time in seconds after which an idle buffer may be reaped */
	hrtime_t timeout;
};
161 
/*
 * Bookkeeping header placed in front of every buffer handed to the zstd
 * library; zstd_free() steps back from the user pointer to find it.
 */
struct zstd_kmem {
	/* How the buffer was obtained and therefore how it is released */
	enum zstd_kmem_type kmem_type;
	/* Total allocation size in bytes, header included */
	size_t kmem_size;
	/* Owning pool slot, or NULL for non-pooled allocations */
	struct zstd_pool *pool;
};
168 
/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	/* Size in bytes of the reserved buffer */
	size_t mem_size;
	/* Reserved buffer, allocated once by create_fallback_mem() */
	void *mem;
	/* Serializes users of the single reserved buffer */
	kmutex_t barrier;
};
175 
/* One row of the ZFS-enum <-> zstd-level translation table */
struct zstd_levelmap {
	/* Level value handed to the zstd library (negative for "fast") */
	int16_t zstd_level;
	/* Corresponding ZFS level enum */
	enum zio_zstd_levels level;
};
180 
/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The ZSTD handlers were split up for the most simplified implementation.
 *
 * The compression-side handler is compiled out when IN_LIBSA is defined.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler: { alloc, free, opaque } */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler: { alloc, free, opaque } */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
210 
211 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
212 static struct zstd_levelmap zstd_levels[] = {
213 	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
214 	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
215 	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
216 	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
217 	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
218 	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
219 	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
220 	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
221 	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
222 	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
223 	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
224 	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
225 	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
226 	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
227 	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
228 	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
229 	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
230 	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
231 	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
232 	{-1, ZIO_ZSTD_LEVEL_FAST_1},
233 	{-2, ZIO_ZSTD_LEVEL_FAST_2},
234 	{-3, ZIO_ZSTD_LEVEL_FAST_3},
235 	{-4, ZIO_ZSTD_LEVEL_FAST_4},
236 	{-5, ZIO_ZSTD_LEVEL_FAST_5},
237 	{-6, ZIO_ZSTD_LEVEL_FAST_6},
238 	{-7, ZIO_ZSTD_LEVEL_FAST_7},
239 	{-8, ZIO_ZSTD_LEVEL_FAST_8},
240 	{-9, ZIO_ZSTD_LEVEL_FAST_9},
241 	{-10, ZIO_ZSTD_LEVEL_FAST_10},
242 	{-20, ZIO_ZSTD_LEVEL_FAST_20},
243 	{-30, ZIO_ZSTD_LEVEL_FAST_30},
244 	{-40, ZIO_ZSTD_LEVEL_FAST_40},
245 	{-50, ZIO_ZSTD_LEVEL_FAST_50},
246 	{-60, ZIO_ZSTD_LEVEL_FAST_60},
247 	{-70, ZIO_ZSTD_LEVEL_FAST_70},
248 	{-80, ZIO_ZSTD_LEVEL_FAST_80},
249 	{-90, ZIO_ZSTD_LEVEL_FAST_90},
250 	{-100, ZIO_ZSTD_LEVEL_FAST_100},
251 	{-500, ZIO_ZSTD_LEVEL_FAST_500},
252 	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
253 };
254 
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

/* Number of slots in each memory pool */
#define	ZSTD_POOL_MAX		pool_count
/*
 * Idle time in seconds after which a pooled buffer may be reaped.
 * Parenthesized so the macro expands safely inside any expression.
 */
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
263 
/* Reserved decompression buffer, set up by zstd_meminit() */
static struct zstd_fallback_mem zstd_dctx_fallback;
/* Pool slot arrays (ZSTD_POOL_MAX entries each) for compression contexts */
static struct zstd_pool *zstd_mempool_cctx;
/* ... and for decompression contexts */
static struct zstd_pool *zstd_mempool_dctx;
267 
/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 *
 * In kernel builds the two hooks are provided here as no-ops.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);

/* No-op stand-in for the ASAN runtime hook. */
void
__asan_unpoison_memory_region(void const volatile *addr, size_t size)
{
	(void) addr;
	(void) size;
}

/* No-op stand-in for the ASAN runtime hook. */
void
__asan_poison_memory_region(void const volatile *addr, size_t size)
{
	(void) addr;
	(void) size;
}
#endif
282 
283 
284 static void
285 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
286 {
287 	struct zstd_pool *pool;
288 
289 	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
290 		return;
291 	}
292 
293 	/* free obsolete slots */
294 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
295 		pool = &zstd_mempool[i];
296 		if (pool->mem && mutex_tryenter(&pool->barrier)) {
297 			/* Free memory if unused object older than 2 minutes */
298 			if (pool->mem && gethrestime_sec() > pool->timeout) {
299 				vmem_free(pool->mem, pool->size);
300 				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
301 				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
302 				pool->mem = NULL;
303 				pool->size = 0;
304 				pool->timeout = 0;
305 			}
306 			mutex_exit(&pool->barrier);
307 		}
308 	}
309 }
310 
311 /*
312  * Try to get a cached allocated buffer from memory pool or allocate a new one
313  * if necessary. If a object is older than 2 minutes and does not fit the
314  * requested size, it will be released and a new cached entry will be allocated.
315  * If other pooled objects are detected without being used for 2 minutes, they
316  * will be released, too.
317  *
318  * The concept is that high frequency memory allocations of bigger objects are
319  * expensive. So if a lot of work is going on, allocations will be kept for a
320  * while and can be reused in that time frame.
321  *
322  * The scheduled release will be updated every time a object is reused.
323  */
324 
325 static void *
326 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
327 {
328 	struct zstd_pool *pool;
329 	struct zstd_kmem *mem = NULL;
330 
331 	if (!zstd_mempool) {
332 		return (NULL);
333 	}
334 
335 	/* Seek for preallocated memory slot and free obsolete slots */
336 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
337 		pool = &zstd_mempool[i];
338 		/*
339 		 * This lock is simply a marker for a pool object being in use.
340 		 * If it's already hold, it will be skipped.
341 		 *
342 		 * We need to create it before checking it to avoid race
343 		 * conditions caused by running in a threaded context.
344 		 *
345 		 * The lock is later released by zstd_mempool_free.
346 		 */
347 		if (mutex_tryenter(&pool->barrier)) {
348 			/*
349 			 * Check if objects fits the size, if so we take it and
350 			 * update the timestamp.
351 			 */
352 			if (pool->mem && size <= pool->size) {
353 				pool->timeout = gethrestime_sec() +
354 				    ZSTD_POOL_TIMEOUT;
355 				mem = pool->mem;
356 				return (mem);
357 			}
358 			mutex_exit(&pool->barrier);
359 		}
360 	}
361 
362 	/*
363 	 * If no preallocated slot was found, try to fill in a new one.
364 	 *
365 	 * We run a similar algorithm twice here to avoid pool fragmentation.
366 	 * The first one may generate holes in the list if objects get released.
367 	 * We always make sure that these holes get filled instead of adding new
368 	 * allocations constantly at the end.
369 	 */
370 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
371 		pool = &zstd_mempool[i];
372 		if (mutex_tryenter(&pool->barrier)) {
373 			/* Object is free, try to allocate new one */
374 			if (!pool->mem) {
375 				mem = vmem_alloc(size, KM_SLEEP);
376 				if (mem) {
377 					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
378 					ZSTDSTAT_ADD(zstd_stat_size, size);
379 					pool->mem = mem;
380 					pool->size = size;
381 					/* Keep track for later release */
382 					mem->pool = pool;
383 					mem->kmem_type = ZSTD_KMEM_POOL;
384 					mem->kmem_size = size;
385 				}
386 			}
387 
388 			if (size <= pool->size) {
389 				/* Update timestamp */
390 				pool->timeout = gethrestime_sec() +
391 				    ZSTD_POOL_TIMEOUT;
392 
393 				return (pool->mem);
394 			}
395 
396 			mutex_exit(&pool->barrier);
397 		}
398 	}
399 
400 	/*
401 	 * If the pool is full or the allocation failed, try lazy allocation
402 	 * instead.
403 	 */
404 	if (!mem) {
405 		mem = vmem_alloc(size, KM_NOSLEEP);
406 		if (mem) {
407 			mem->pool = NULL;
408 			mem->kmem_type = ZSTD_KMEM_DEFAULT;
409 			mem->kmem_size = size;
410 		}
411 	}
412 
413 	return (mem);
414 }
415 
416 /* Mark object as released by releasing the barrier mutex */
417 static void
418 zstd_mempool_free(struct zstd_kmem *z)
419 {
420 	mutex_exit(&z->pool->barrier);
421 }
422 
423 /* Convert ZFS internal enum to ZSTD level */
424 static int
425 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
426 {
427 	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
428 		*zstd_level = zstd_levels[level - 1].zstd_level;
429 		return (0);
430 	}
431 	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
432 	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
433 		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
434 		    + ZIO_ZSTD_LEVEL_19].zstd_level;
435 		return (0);
436 	}
437 
438 	/* Invalid/unknown zfs compression enum - this should never happen. */
439 	return (1);
440 }
441 
442 #ifndef IN_LIBSA
/*
 * Linear-buffer compression entry point with the early-abort heuristic.
 *
 * This used to carry its own copy of the heuristic, which handed the raw
 * s_start/d_start buffers straight to zfs_lz4_compress(), whereas
 * zfs_zstd_compress_buf() wraps the same buffers in abd_t before making that
 * call. To keep a single, consistent implementation, this function now
 * simply forwards to zfs_zstd_compress_buf().
 *
 * Returns the compressed size including the block header, or s_len when the
 * data should be stored uncompressed.
 */
static size_t zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level);

static size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	return (zfs_zstd_compress_buf(s_start, d_start, s_len, d_len, level));
}
500 
501 /* Compress block using zstd */
502 static size_t
503 zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
504     int level)
505 {
506 	size_t c_len;
507 	int16_t zstd_level;
508 	zfs_zstdhdr_t *hdr;
509 	ZSTD_CCtx *cctx;
510 
511 	hdr = (zfs_zstdhdr_t *)d_start;
512 
513 	/* Skip compression if the specified level is invalid */
514 	if (zstd_enum_to_level(level, &zstd_level)) {
515 		ZSTDSTAT_BUMP(zstd_stat_com_inval);
516 		return (s_len);
517 	}
518 
519 	ASSERT3U(d_len, >=, sizeof (*hdr));
520 	ASSERT3U(d_len, <=, s_len);
521 	ASSERT3U(zstd_level, !=, 0);
522 
523 	cctx = ZSTD_createCCtx_advanced(zstd_malloc);
524 
525 	/*
526 	 * Out of kernel memory, gently fall through - this will disable
527 	 * compression in zio_compress_data
528 	 */
529 	if (!cctx) {
530 		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
531 		return (s_len);
532 	}
533 
534 	/* Set the compression level */
535 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
536 
537 	/* Use the "magicless" zstd header which saves us 4 header bytes */
538 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
539 
540 	/*
541 	 * Disable redundant checksum calculation and content size storage since
542 	 * this is already done by ZFS itself.
543 	 */
544 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
545 	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
546 
547 	c_len = ZSTD_compress2(cctx,
548 	    hdr->data,
549 	    d_len - sizeof (*hdr),
550 	    s_start, s_len);
551 
552 	ZSTD_freeCCtx(cctx);
553 
554 	/* Error in the compression routine, disable compression. */
555 	if (ZSTD_isError(c_len)) {
556 		/*
557 		 * If we are aborting the compression because the saves are
558 		 * too small, that is not a failure. Everything else is a
559 		 * failure, so increment the compression failure counter.
560 		 */
561 		int err = ZSTD_getErrorCode(c_len);
562 		if (err != ZSTD_error_dstSize_tooSmall) {
563 			ZSTDSTAT_BUMP(zstd_stat_com_fail);
564 			dprintf("Error: %s", ZSTD_getErrorString(err));
565 		}
566 		return (s_len);
567 	}
568 
569 	/*
570 	 * Encode the compressed buffer size at the start. We'll need this in
571 	 * decompression to counter the effects of padding which might be added
572 	 * to the compressed buffer and which, if unhandled, would confuse the
573 	 * hell out of our decompression function.
574 	 */
575 	hdr->c_len = BE_32(c_len);
576 
577 	/*
578 	 * Check version for overflow.
579 	 * The limit of 24 bits must not be exceeded. This allows a maximum
580 	 * version 1677.72.15 which we don't expect to be ever reached.
581 	 */
582 	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
583 
584 	/*
585 	 * Encode the compression level as well. We may need to know the
586 	 * original compression level if compressed_arc is disabled, to match
587 	 * the compression settings to write this block to the L2ARC.
588 	 *
589 	 * Encode the actual level, so if the enum changes in the future, we
590 	 * will be compatible.
591 	 *
592 	 * The upper 24 bits store the ZSTD version to be able to provide
593 	 * future compatibility, since new versions might enhance the
594 	 * compression algorithm in a way, where the compressed data will
595 	 * change.
596 	 *
597 	 * As soon as such incompatibility occurs, handling code needs to be
598 	 * added, differentiating between the versions.
599 	 */
600 	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
601 	zfs_set_hdrlevel(hdr, level);
602 	hdr->raw_version_level = BE_32(hdr->raw_version_level);
603 
604 	return (c_len + sizeof (*hdr));
605 }
606 
607 static size_t
608 zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
609     int level)
610 {
611 	int16_t zstd_level;
612 	if (zstd_enum_to_level(level, &zstd_level)) {
613 		ZSTDSTAT_BUMP(zstd_stat_com_inval);
614 		return (s_len);
615 	}
616 	/*
617 	 * A zstd early abort heuristic.
618 	 *
619 	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
620 	 *   128k), don't try any of this, just go.
621 	 *   (because experimentally that was a reasonable cutoff for a perf win
622 	 *   with tiny ratio change)
623 	 * - First, we try LZ4 compression, and if it doesn't early abort, we
624 	 *   jump directly to whatever compression level we intended to try.
625 	 * - Second, we try zstd-1 - if that errors out (usually, but not
626 	 *   exclusively, if it would overflow), we give up early.
627 	 *
628 	 *   If it works, instead we go on and compress anyway.
629 	 *
630 	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
631 	 * compressible data, it was losing up to 8.5% of the compressed
632 	 * savings versus no early abort, and all the zstd-fast levels are
633 	 * worse indications on their own than LZ4, and don't improve the LZ4
634 	 * pass noticably if stacked like this.
635 	 */
636 	size_t actual_abort_size = zstd_abort_size;
637 	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
638 	    s_len >= actual_abort_size) {
639 		int pass_len = 1;
640 		abd_t sabd, dabd;
641 		abd_get_from_buf_struct(&sabd, s_start, s_len);
642 		abd_get_from_buf_struct(&dabd, d_start, d_len);
643 		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
644 		abd_free(&dabd);
645 		abd_free(&sabd);
646 		if (pass_len < d_len) {
647 			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
648 			goto keep_trying;
649 		}
650 		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
651 
652 		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
653 		    d_len, ZIO_ZSTD_LEVEL_1);
654 		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
655 			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
656 			return (s_len);
657 		}
658 		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
659 	} else {
660 		ZSTDSTAT_BUMP(zstd_stat_passignored);
661 		if (s_len < actual_abort_size) {
662 			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
663 		}
664 	}
665 keep_trying:
666 	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
667 
668 }
669 #endif
670 
671 /* Decompress block using zstd and return its stored level */
672 static int
673 zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
674     size_t d_len, uint8_t *level)
675 {
676 	ZSTD_DCtx *dctx;
677 	size_t result;
678 	int16_t zstd_level;
679 	uint32_t c_len;
680 	const zfs_zstdhdr_t *hdr;
681 	zfs_zstdhdr_t hdr_copy;
682 
683 	hdr = (const zfs_zstdhdr_t *)s_start;
684 	c_len = BE_32(hdr->c_len);
685 
686 	/*
687 	 * Make a copy instead of directly converting the header, since we must
688 	 * not modify the original data that may be used again later.
689 	 */
690 	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
691 	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
692 
693 	/*
694 	 * NOTE: We ignore the ZSTD version for now. As soon as any
695 	 * incompatibility occurs, it has to be handled accordingly.
696 	 * The version can be accessed via `hdr_copy.version`.
697 	 */
698 
699 	/*
700 	 * Convert and check the level
701 	 * An invalid level is a strong indicator for data corruption! In such
702 	 * case return an error so the upper layers can try to fix it.
703 	 */
704 	if (zstd_enum_to_level(curlevel, &zstd_level)) {
705 		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
706 		return (1);
707 	}
708 
709 	ASSERT3U(d_len, >=, s_len);
710 	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
711 
712 	/* Invalid compressed buffer size encoded at start */
713 	if (c_len + sizeof (*hdr) > s_len) {
714 		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
715 		return (1);
716 	}
717 
718 	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
719 	if (!dctx) {
720 		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
721 		return (1);
722 	}
723 
724 	/* Set header type to "magicless" */
725 	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
726 
727 	/* Decompress the data and release the context */
728 	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
729 	ZSTD_freeDCtx(dctx);
730 
731 	/*
732 	 * Returns 0 on success (decompression function returned non-negative)
733 	 * and non-zero on failure (decompression function returned negative.
734 	 */
735 	if (ZSTD_isError(result)) {
736 		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
737 		return (1);
738 	}
739 
740 	if (level) {
741 		*level = curlevel;
742 	}
743 
744 	return (0);
745 }
746 
747 /* Decompress datablock using zstd */
748 #ifdef IN_BASE
749 int
750 zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
751     size_t d_len, int level __maybe_unused)
752 {
753 
754 	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
755 	    NULL));
756 }
757 #else
758 static int
759 zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
760     size_t d_len, int level __maybe_unused)
761 {
762 
763 	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
764 	    NULL));
765 }
766 #endif
767 
768 #ifndef IN_LIBSA
/*
 * NOTE(review): these macros are defined outside this file; presumably they
 * emit the public zfs_zstd_compress(), zfs_zstd_decompress() and
 * zfs_zstd_decompress_level() entry points on top of the matching *_buf
 * functions above - confirm against the macro definitions.
 */
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
772 
773 /* Allocator for zstd compression context using mempool_allocator */
774 static void *
775 zstd_alloc(void *opaque __maybe_unused, size_t size)
776 {
777 	size_t nbytes = sizeof (struct zstd_kmem) + size;
778 	struct zstd_kmem *z = NULL;
779 
780 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
781 
782 	if (!z) {
783 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
784 		return (NULL);
785 	}
786 
787 	return ((void*)z + (sizeof (struct zstd_kmem)));
788 }
789 
790 #endif
791 /*
792  * Allocator for zstd decompression context using mempool_allocator with
793  * fallback to reserved memory if allocation fails
794  */
795 static void *
796 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
797 {
798 	size_t nbytes = sizeof (struct zstd_kmem) + size;
799 	struct zstd_kmem *z = NULL;
800 	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
801 
802 	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
803 	if (!z) {
804 		/* Try harder, decompression shall not fail */
805 		z = vmem_alloc(nbytes, KM_SLEEP);
806 		if (z) {
807 			z->pool = NULL;
808 		}
809 		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
810 	} else {
811 		return ((void*)z + (sizeof (struct zstd_kmem)));
812 	}
813 
814 	/* Fallback if everything fails */
815 	if (!z) {
816 		/*
817 		 * Barrier since we only can handle it in a single thread. All
818 		 * other following threads need to wait here until decompression
819 		 * is completed. zstd_free will release this barrier later.
820 		 */
821 		mutex_enter(&zstd_dctx_fallback.barrier);
822 
823 		z = zstd_dctx_fallback.mem;
824 		type = ZSTD_KMEM_DCTX;
825 		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
826 	}
827 
828 	/* Allocation should always be successful */
829 	if (!z) {
830 		return (NULL);
831 	}
832 
833 	z->kmem_type = type;
834 	z->kmem_size = nbytes;
835 
836 	return ((void*)z + (sizeof (struct zstd_kmem)));
837 }
838 
839 /* Free allocated memory by its specific type */
840 static void
841 zstd_free(void *opaque __maybe_unused, void *ptr)
842 {
843 	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
844 	enum zstd_kmem_type type;
845 
846 	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
847 	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
848 
849 	type = z->kmem_type;
850 	switch (type) {
851 	case ZSTD_KMEM_DEFAULT:
852 		vmem_free(z, z->kmem_size);
853 		break;
854 	case ZSTD_KMEM_POOL:
855 		zstd_mempool_free(z);
856 		break;
857 	case ZSTD_KMEM_DCTX:
858 		mutex_exit(&zstd_dctx_fallback.barrier);
859 		break;
860 	default:
861 		break;
862 	}
863 }
864 
865 /* Allocate fallback memory to ensure safe decompression */
866 static void __init
867 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
868 {
869 	mem->mem_size = size;
870 	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
871 	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
872 }
873 
874 /* Initialize memory pool barrier mutexes */
875 static void __init
876 zstd_mempool_init(void)
877 {
878 	zstd_mempool_cctx =
879 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
880 	zstd_mempool_dctx =
881 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
882 
883 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
884 		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
885 		    MUTEX_DEFAULT, NULL);
886 		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
887 		    MUTEX_DEFAULT, NULL);
888 	}
889 }
890 
891 /* Initialize zstd-related memory handling */
892 static int __init
893 zstd_meminit(void)
894 {
895 	zstd_mempool_init();
896 
897 	/*
898 	 * Estimate the size of the fallback decompression context.
899 	 * The expected size on x64 with current ZSTD should be about 160 KB.
900 	 */
901 	create_fallback_mem(&zstd_dctx_fallback,
902 	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
903 	    PAGESIZE));
904 
905 	return (0);
906 }
907 
908 /* Release object from pool and free memory */
909 static void
910 release_pool(struct zstd_pool *pool)
911 {
912 	mutex_destroy(&pool->barrier);
913 	vmem_free(pool->mem, pool->size);
914 	pool->mem = NULL;
915 	pool->size = 0;
916 }
917 
918 /* Release memory pool objects */
919 static void
920 zstd_mempool_deinit(void)
921 {
922 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
923 		release_pool(&zstd_mempool_cctx[i]);
924 		release_pool(&zstd_mempool_dctx[i]);
925 	}
926 
927 	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
928 	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
929 	zstd_mempool_dctx = NULL;
930 	zstd_mempool_cctx = NULL;
931 }
932 
933 /* release unused memory from pool */
934 
935 void
936 zfs_zstd_cache_reap_now(void)
937 {
938 
939 	/*
940 	 * Short-circuit if there are no buffers to begin with.
941 	 */
942 	if (ZSTDSTAT(zstd_stat_buffers) == 0)
943 		return;
944 
945 	/*
946 	 * calling alloc with zero size seeks
947 	 * and releases old unused objects
948 	 */
949 	zstd_mempool_reap(zstd_mempool_cctx);
950 	zstd_mempool_reap(zstd_mempool_dctx);
951 }
952 
953 extern int __init
954 zstd_init(void)
955 {
956 	/* Set pool size by using maximum sane thread count * 4 */
957 	pool_count = (boot_ncpus * 4);
958 	zstd_meminit();
959 
960 	/* Initialize kstat */
961 	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
962 	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
963 	    KSTAT_FLAG_VIRTUAL);
964 	if (zstd_ksp != NULL) {
965 		zstd_ksp->ks_data = &zstd_stats;
966 		kstat_install(zstd_ksp);
967 #ifdef _KERNEL
968 		zstd_ksp->ks_update = kstat_zstd_update;
969 #endif
970 	}
971 
972 	return (0);
973 }
974 
975 extern void
976 zstd_fini(void)
977 {
978 	/* Deinitialize kstat */
979 	if (zstd_ksp != NULL) {
980 		kstat_delete(zstd_ksp);
981 		zstd_ksp = NULL;
982 	}
983 
984 	/* Release fallback memory */
985 	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
986 	mutex_destroy(&zstd_dctx_fallback.barrier);
987 
988 	/* Deinit memory pool */
989 	zstd_mempool_deinit();
990 }
991 
#if defined(_KERNEL)
#ifdef __FreeBSD__
/* Register the init/fini entry points on FreeBSD. */
module_init(zstd_init);
module_exit(zstd_fini);
#endif

/*
 * Expose the early-abort tunables zstd_earlyabort_pass and zstd_abort_size.
 * NOTE(review): zstd_cutoff_level has no module parameter here - confirm
 * whether that is intentional.
 */
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
1003