// SPDX-License-Identifier: BSD-3-Clause
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * Blocks for which the early abort heuristic was not attempted at all
	 * (early abort disabled, level below the cutoff, or block smaller
	 * than zstd_abort_size)
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
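
/*
 * Note (informational, not required by the code): these counters are exposed
 * through the kstat created in zstd_init() ("zfs", "misc", "zstd"); on Linux
 * they typically appear under /proc/spl/kstat/zfs/zstd and on FreeBSD under
 * the kstat.zfs.misc.zstd sysctl tree.
 */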

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};
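
/*
 * The allocators below (zstd_alloc, zstd_dctx_alloc) prepend this header to
 * every buffer handed to the zstd library and return the address just past
 * it; zstd_free() steps back by sizeof (struct zstd_kmem) to recover the
 * allocation type and size.
 */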

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The ZSTD handlers were split up to keep the implementation as simple as
 * possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
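
/*
 * zstd_enum_to_level() below indexes this table directly: levels 1-19 sit at
 * indices 0-18, and the "fast" levels follow starting at index
 * ZIO_ZSTD_LEVEL_19, so e.g. ZIO_ZSTD_LEVEL_FAST_3 resolves to zstd level -3.
 * This assumes the table order matches the zio_zstd_levels enum.
 */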

/*
 * This variable represents the maximum number of slots in each memory pool,
 * based on the number of CPUs plus some headroom. We default to
 * cpu count * 4; see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2	/* seconds */

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these functions if ADDRESS_SANITIZER is
 * defined, and while ASAN provides them, KASAN does not. So to avoid
 * changing the external code, we supply empty stubs here.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * If other pooled objects are detected without being used for 2 minutes, they
 * will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size, if so we take it
			 * and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
static size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is below the cutoff level (zstd-3 by default), or
	 *   smaller than zstd_abort_size (currently 128k), don't try any of
	 *   this, just go (because experimentally that was a reasonable
	 *   cutoff for a perf win with a tiny ratio change).
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}

/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version of 1677.72.15, which we never expect to be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
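
/*
 * For reference: the block produced above starts with the zfs_zstdhdr_t
 * header (a 32-bit big-endian c_len holding the size of the zstd frame, plus
 * a 32-bit big-endian field packing the 24-bit ZSTD version with the 8-bit
 * ZFS level via zfs_set_hdrversion()/zfs_set_hdrlevel()), followed
 * immediately by the magicless zstd frame. zfs_zstd_decompress_level_buf()
 * parses the same layout.
 */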

static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is below the cutoff level (zstd-3 by default), or
	 *   smaller than zstd_abort_size (currently 128k), don't try any of
	 *   this, just go (because experimentally that was a reasonable
	 *   cutoff for a perf win with a tiny ratio change).
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));

}
#endif

/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In such
	 * a case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{

	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#endif

#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we only can handle it in a single thread. All
		 * other following threads need to wait here until decompression
		 * is completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
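
/*
 * The three possible outcomes above map onto the zstd_kmem_type values that
 * zstd_free() switches on: a pooled buffer is tagged ZSTD_KMEM_POOL by
 * zstd_mempool_alloc(), a plain vmem_alloc() buffer stays ZSTD_KMEM_DEFAULT,
 * and the reserved fallback region is tagged ZSTD_KMEM_DCTX and remains
 * serialized by the fallback barrier mutex until it is freed.
 */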

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */

void
zfs_zstd_cache_reap_now(void)
{

	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reap both pools, releasing objects that have sat unused past
	 * their timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
1004