1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2018, Joyent, Inc.
25 * Copyright (c) 2011, 2020, Delphix. All rights reserved.
26 * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
27 * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
29 * Copyright (c) 2020, George Amanakis. All rights reserved.
30 * Copyright (c) 2019, 2024, 2025, Klara, Inc.
31 * Copyright (c) 2019, Allan Jude
32 * Copyright (c) 2020, The FreeBSD Foundation [1]
33 * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
34 *
35 * [1] Portions of this software were developed by Allan Jude
36 * under sponsorship from the FreeBSD Foundation.
37 */
38
39 /*
40 * DVA-based Adjustable Replacement Cache
41 *
42 * While much of the theory of operation used here is
43 * based on the self-tuning, low overhead replacement cache
44 * presented by Megiddo and Modha at FAST 2003, there are some
45 * significant differences:
46 *
47 * 1. The Megiddo and Modha model assumes any page is evictable.
48 * Pages in its cache cannot be "locked" into memory. This makes
49 * the eviction algorithm simple: evict the last page in the list.
50 * This also makes the performance characteristics easy to reason
51 * about. Our cache is not so simple. At any given moment, some
52 * subset of the blocks in the cache are un-evictable because we
53 * have handed out a reference to them. Blocks are only evictable
54 * when there are no external references active. This makes
55 * eviction far more problematic: we choose to evict the evictable
56 * blocks that are the "lowest" in the list.
57 *
58 * There are times when it is not possible to evict the requested
59 * space. In these circumstances we are unable to adjust the cache
60 * size. To prevent the cache growing unbounded at these times we
61 * implement a "cache throttle" that slows the flow of new data
62 * into the cache until we can make space available.
63 *
64 * 2. The Megiddo and Modha model assumes a fixed cache size.
65 * Pages are evicted when the cache is full and there is a cache
66 * miss. Our model has a variable sized cache. It grows with
67 * high use, but also tries to react to memory pressure from the
68 * operating system: decreasing its size when system memory is
69 * tight.
70 *
71 * 3. The Megiddo and Modha model assumes a fixed page size. All
72 * elements of the cache are therefore exactly the same size. So
73 * when adjusting the cache size following a cache miss, it's simply
74 * a matter of choosing a single page to evict. In our model, we
75 * have variable sized cache blocks (ranging from 512 bytes to
76 * 128K bytes). We therefore choose a set of blocks to evict to make
77 * space for a cache miss that approximates as closely as possible
78 * the space used by the new block.
79 *
80 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
81 * by N. Megiddo & D. Modha, FAST 2003
82 */
83
84 /*
85 * The locking model:
86 *
87 * A new reference to a cache buffer can be obtained in two
88 * ways: 1) via a hash table lookup using the DVA as a key,
89 * or 2) via one of the ARC lists. The arc_read() interface
90 * uses method 1, while the internal ARC algorithms for
91 * adjusting the cache use method 2. We therefore provide two
92 * types of locks: 1) the hash table lock array, and 2) the
93 * ARC list locks.
94 *
95 * Buffers do not have their own mutexes, rather they rely on the
96 * hash table mutexes for the bulk of their protection (i.e. most
97 * fields in the arc_buf_hdr_t are protected by these mutexes).
98 *
99 * buf_hash_find() returns the appropriate mutex (held) when it
100 * locates the requested buffer in the hash table. It returns
101 * NULL for the mutex if the buffer was not in the table.
102 *
103 * buf_hash_remove() expects the appropriate hash mutex to be
104 * already held before it is invoked.
105 *
106 * Each ARC state also has a mutex which is used to protect the
107 * buffer list associated with the state. When attempting to
108 * obtain a hash table lock while holding an ARC list lock you
109 * must use: mutex_tryenter() to avoid deadlock. Also note that
110 * the active state mutex must be held before the ghost state mutex.
111 *
112 * It is also possible to register a callback which is run when the
113 * metadata limit is reached and no buffers can be safely evicted. In
114 * this case the arc user should drop a reference on some arc buffers so
115 * they can be reclaimed. For example, when using the ZPL each dentry
116 * holds a reference on a znode. These dentries must be pruned before
117 * the arc buffer holding the znode can be safely evicted.
118 *
119 * Note that the majority of the performance stats are manipulated
120 * with atomic operations.
121 *
122 * The L2ARC uses the l2ad_mtx on each vdev for the following:
123 *
124 * - L2ARC buflist creation
125 * - L2ARC buflist eviction
126 * - L2ARC write completion, which walks L2ARC buflists
127 * - ARC header destruction, as it removes from L2ARC buflists
128 * - ARC header release, as it removes from L2ARC buflists
129 */
130
131 /*
132 * ARC operation:
133 *
134 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
135 * This structure can point either to a block that is still in the cache or to
136 * one that is only accessible in an L2 ARC device, or it can provide
137 * information about a block that was recently evicted. If a block is
138 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
139 * information to retrieve it from the L2ARC device. This information is
140 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
141 * that is in this state cannot access the data directly.
142 *
143 * Blocks that are actively being referenced or have not been evicted
144 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
145 * the arc_buf_hdr_t that will point to the data block in memory. A block can
146 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
147 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
148 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
149 *
150 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
151 * ability to store the physical data (b_pabd) associated with the DVA of the
152 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
153 * it will match its on-disk compression characteristics. This behavior can be
154 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
155 * compressed ARC functionality is disabled, the b_pabd will point to an
156 * uncompressed version of the on-disk data.
157 *
158 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
159 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
160 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
161 * consumer. The ARC will provide references to this data and will keep it
162 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
163 * data block and will evict any arc_buf_t that is no longer referenced. The
164 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
165 * "overhead_size" kstat.
166 *
167 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
168 * compressed form. The typical case is that consumers will want uncompressed
169 * data, and when that happens a new data buffer is allocated where the data is
170 * decompressed for them to use. Currently the only consumer who wants
171 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
172 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
173 * with the arc_buf_hdr_t.
174 *
175 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
176 * first one is owned by a compressed send consumer (and therefore references
177 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
178 * used by any other consumer (and has its own uncompressed copy of the data
179 * buffer).
180 *
181 * arc_buf_hdr_t
182 * +-----------+
183 * | fields |
184 * | common to |
185 * | L1- and |
186 * | L2ARC |
187 * +-----------+
188 * | l2arc_buf_hdr_t
189 * | |
190 * +-----------+
191 * | l1arc_buf_hdr_t
192 * | | arc_buf_t
193 * | b_buf +------------>+-----------+ arc_buf_t
194 * | b_pabd +-+ |b_next +---->+-----------+
195 * +-----------+ | |-----------| |b_next +-->NULL
196 * | |b_comp = T | +-----------+
197 * | |b_data +-+ |b_comp = F |
198 * | +-----------+ | |b_data +-+
199 * +->+------+ | +-----------+ |
200 * compressed | | | |
201 * data | |<--------------+ | uncompressed
202 * +------+ compressed, | data
203 * shared +-->+------+
204 * data | |
205 * | |
206 * +------+
207 *
208 * When a consumer reads a block, the ARC must first look to see if the
209 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
210 * arc_buf_t and either copies uncompressed data into a new data buffer from an
211 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
212 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
213 * hdr is compressed and the desired compression characteristics of the
214 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
215 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
216 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
217 * be anywhere in the hdr's list.
218 *
219 * The diagram below shows an example of an uncompressed ARC hdr that is
220 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
221 * the last element in the buf list):
222 *
223 * arc_buf_hdr_t
224 * +-----------+
225 * | |
226 * | |
227 * | |
228 * +-----------+
229 * l2arc_buf_hdr_t| |
230 * | |
231 * +-----------+
232 * l1arc_buf_hdr_t| |
233 * | | arc_buf_t (shared)
234 * | b_buf +------------>+---------+ arc_buf_t
235 * | | |b_next +---->+---------+
236 * | b_pabd +-+ |---------| |b_next +-->NULL
237 * +-----------+ | | | +---------+
238 * | |b_data +-+ | |
239 * | +---------+ | |b_data +-+
240 * +->+------+ | +---------+ |
241 * | | | |
242 * uncompressed | | | |
243 * data +------+ | |
244 * ^ +->+------+ |
245 * | uncompressed | | |
246 * | data | | |
247 * | +------+ |
248 * +---------------------------------+
249 *
250 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
251 * since the physical block is about to be rewritten. The new data contents
252 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
253 * it may compress the data before writing it to disk. The ARC will be called
254 * with the transformed data and will memcpy the transformed on-disk block into
255 * a newly allocated b_pabd. Writes are always done into buffers which have
256 * either been loaned (and hence are new and don't have other readers) or
257 * buffers which have been released (and hence have their own hdr, if there
258 * were originally other readers of the buf's original hdr). This ensures that
259 * the ARC only needs to update a single buf and its hdr after a write occurs.
260 *
261 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
262 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
263 * that when compressed ARC is enabled, the L2ARC blocks are identical
264 * to the on-disk block in the main data pool. This provides a significant
265 * advantage since the ARC can leverage the bp's checksum when reading from the
266 * L2ARC to determine if the contents are valid. However, if the compressed
267 * ARC is disabled, then the L2ARC's block must be transformed to look
268 * like the physical block in the main data pool before comparing the
269 * checksum and determining its validity.
270 *
271 * The L1ARC has a slightly different system for storing encrypted data.
272 * Raw (encrypted + possibly compressed) data has a few subtle differences from
273 * data that is just compressed. The biggest difference is that it is not
274 * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
275 * The other difference is that encryption cannot be treated as a suggestion.
276 * If a caller would prefer compressed data, but they actually wind up with
277 * uncompressed data the worst thing that could happen is there might be a
278 * performance hit. If the caller requests encrypted data, however, we must be
279 * sure they actually get it or else secret information could be leaked. Raw
280 * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
281 * may have both an encrypted version and a decrypted version of its data at
282 * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
283 * copied out of this header. To avoid complications with b_pabd, raw buffers
284 * cannot be shared.
285 */
286
287 #include <sys/spa.h>
288 #include <sys/zio.h>
289 #include <sys/spa_impl.h>
290 #include <sys/zio_compress.h>
291 #include <sys/zio_checksum.h>
292 #include <sys/zfs_context.h>
293 #include <sys/arc.h>
294 #include <sys/zfs_refcount.h>
295 #include <sys/vdev.h>
296 #include <sys/vdev_impl.h>
297 #include <sys/dsl_pool.h>
298 #include <sys/multilist.h>
299 #include <sys/abd.h>
300 #include <sys/dbuf.h>
301 #include <sys/zil.h>
302 #include <sys/fm/fs/zfs.h>
303 #include <sys/callb.h>
304 #include <sys/kstat.h>
305 #include <sys/zthr.h>
306 #include <zfs_fletcher.h>
307 #include <sys/arc_impl.h>
308 #include <sys/trace_zfs.h>
309 #include <sys/aggsum.h>
310 #include <sys/wmsum.h>
311 #include <cityhash.h>
312 #include <sys/vdev_trim.h>
313 #include <sys/zfs_racct.h>
314 #include <sys/zstd/zstd.h>
315
316 #ifndef _KERNEL
317 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
318 boolean_t arc_watch = B_FALSE;
319 #endif
320
321 /*
322 * This thread's job is to keep enough free memory in the system, by
323 * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
324 * arc_available_memory().
325 */
326 static zthr_t *arc_reap_zthr;
327
328 /*
329 * This thread's job is to keep arc_size under arc_c, by calling
330 * arc_evict(), which improves arc_is_overflowing().
331 */
332 static zthr_t *arc_evict_zthr;
333 static arc_buf_hdr_t **arc_state_evict_markers;
334 static int arc_state_evict_marker_count;
335
336 static kmutex_t arc_evict_lock;
337 static boolean_t arc_evict_needed = B_FALSE;
338 static clock_t arc_last_uncached_flush;
339
340 static taskq_t *arc_evict_taskq;
341 static struct evict_arg *arc_evict_arg;
342
343 /*
344 * Count of bytes evicted since boot.
345 */
346 static uint64_t arc_evict_count;
347
348 /*
349 * List of arc_evict_waiter_t's, representing threads waiting for the
350 * arc_evict_count to reach specific values.
351 */
352 static list_t arc_evict_waiters;
353
354 /*
355 * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
356 * the requested amount of data to be evicted. For example, by default for
357 * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
358 * Since this is above 100%, it ensures that progress is made towards getting
359 * arc_size under arc_c. Since this is finite, it ensures that allocations
360 * can still happen, even during the potentially long time that arc_size is
361 * more than arc_c.
362 */
363 static uint_t zfs_arc_eviction_pct = 200;
364
365 /*
366 * The number of headers to evict in arc_evict_state_impl() before
367 * dropping the sublist lock and evicting from another sublist. A lower
368 * value means we're more likely to evict the "correct" header (i.e. the
369 * oldest header in the arc state), but comes with higher overhead
370 * (i.e. more invocations of arc_evict_state_impl()).
371 */
372 static uint_t zfs_arc_evict_batch_limit = 10;
373
374 /*
375 * Number of batches to process per parallel eviction task under heavy load to
376 * reduce number of context switches.
377 */
378 static uint_t zfs_arc_evict_batches_limit = 5;
379
380 /* number of seconds before growing cache again */
381 uint_t arc_grow_retry = 5;
382
383 /*
384 * Minimum time between calls to arc_kmem_reap_soon().
385 */
386 static const int arc_kmem_cache_reap_retry_ms = 1000;
387
388 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
389 static int zfs_arc_overflow_shift = 8;
390
391 /* log2(fraction of arc to reclaim) */
392 uint_t arc_shrink_shift = 7;
393
394 #ifdef _KERNEL
395 /* percent of pagecache to reclaim arc to */
396 uint_t zfs_arc_pc_percent = 0;
397 #endif
398
399 /*
400 * log2(fraction of ARC which must be free to allow growing).
401 * I.e. If there is less than arc_c >> zfs_arc_no_grow_shift free memory,
402 * when reading a new block into the ARC, we will evict an equal-sized block
403 * from the ARC.
404 *
405 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
406 * we will still not allow it to grow.
407 */
408 uint_t zfs_arc_no_grow_shift = 5;
409
410
411 /*
412 * minimum lifespan of a prefetch block in clock ticks
413 * (initialized in arc_init())
414 */
415 static uint_t arc_min_prefetch;
416 static uint_t arc_min_prescient_prefetch;
417
418 /*
419 * If this percent of memory is free, don't throttle.
420 */
421 uint_t arc_lotsfree_percent = 10;
422
423 /*
424 * The arc has filled available memory and has now warmed up.
425 */
426 boolean_t arc_warm;
427
428 /*
429 * These tunables are for performance analysis.
430 */
431 uint64_t zfs_arc_max = 0;
432 uint64_t zfs_arc_min = 0;
433 static uint64_t zfs_arc_dnode_limit = 0;
434 static uint_t zfs_arc_dnode_reduce_percent = 10;
435 static uint_t zfs_arc_grow_retry = 0;
436 static uint_t zfs_arc_shrink_shift = 0;
437 uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
438
439 /*
440 * ARC dirty data constraints for arc_tempreserve_space() throttle:
441 * * total dirty data limit
442 * * anon block dirty limit
443 * * each pool's anon allowance
444 */
445 static const unsigned long zfs_arc_dirty_limit_percent = 50;
446 static const unsigned long zfs_arc_anon_limit_percent = 25;
447 static const unsigned long zfs_arc_pool_dirty_percent = 20;
448
449 /*
450 * Enable or disable compressed arc buffers.
451 */
452 int zfs_compressed_arc_enabled = B_TRUE;
453
454 /*
455 * Balance between metadata and data on ghost hits. Values above 100
456 * increase metadata caching by proportionally reducing effect of ghost
457 * data hits on target data/metadata rate.
458 */
459 static uint_t zfs_arc_meta_balance = 500;
460
461 /*
462 * Percentage that can be consumed by dnodes of ARC meta buffers.
463 */
464 static uint_t zfs_arc_dnode_limit_percent = 10;
465
466 /*
467 * These tunables are Linux-specific
468 */
469 static uint64_t zfs_arc_sys_free = 0;
470 static uint_t zfs_arc_min_prefetch_ms = 0;
471 static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
472 static uint_t zfs_arc_lotsfree_percent = 10;
473
474 /*
475 * Number of arc_prune threads
476 */
477 static int zfs_arc_prune_task_threads = 1;
478
479 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
480 static taskq_t *arc_flush_taskq;
481
482 /*
483 * Controls the number of ARC eviction threads to dispatch sublists to.
484 *
485 * Possible values:
486 * 0 (auto) compute the number of threads using a logarithmic formula.
487 * 1 (disabled) one thread - parallel eviction is disabled.
488 * 2+ (manual) set the number manually.
489 *
490 * See arc_evict_thread_init() for how "auto" is computed.
491 */
492 static uint_t zfs_arc_evict_threads = 0;
493
494 /* The 7 states: */
495 arc_state_t ARC_anon;
496 arc_state_t ARC_mru;
497 arc_state_t ARC_mru_ghost;
498 arc_state_t ARC_mfu;
499 arc_state_t ARC_mfu_ghost;
500 arc_state_t ARC_l2c_only;
501 arc_state_t ARC_uncached;
502
503 arc_stats_t arc_stats = {
504 { "hits", KSTAT_DATA_UINT64 },
505 { "iohits", KSTAT_DATA_UINT64 },
506 { "misses", KSTAT_DATA_UINT64 },
507 { "demand_data_hits", KSTAT_DATA_UINT64 },
508 { "demand_data_iohits", KSTAT_DATA_UINT64 },
509 { "demand_data_misses", KSTAT_DATA_UINT64 },
510 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
511 { "demand_metadata_iohits", KSTAT_DATA_UINT64 },
512 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
513 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
514 { "prefetch_data_iohits", KSTAT_DATA_UINT64 },
515 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
516 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
517 { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 },
518 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
519 { "mru_hits", KSTAT_DATA_UINT64 },
520 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
521 { "mfu_hits", KSTAT_DATA_UINT64 },
522 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
523 { "uncached_hits", KSTAT_DATA_UINT64 },
524 { "deleted", KSTAT_DATA_UINT64 },
525 { "mutex_miss", KSTAT_DATA_UINT64 },
526 { "access_skip", KSTAT_DATA_UINT64 },
527 { "evict_skip", KSTAT_DATA_UINT64 },
528 { "evict_not_enough", KSTAT_DATA_UINT64 },
529 { "evict_l2_cached", KSTAT_DATA_UINT64 },
530 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
531 { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
532 { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
533 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
534 { "evict_l2_skip", KSTAT_DATA_UINT64 },
535 { "hash_elements", KSTAT_DATA_UINT64 },
536 { "hash_elements_max", KSTAT_DATA_UINT64 },
537 { "hash_collisions", KSTAT_DATA_UINT64 },
538 { "hash_chains", KSTAT_DATA_UINT64 },
539 { "hash_chain_max", KSTAT_DATA_UINT64 },
540 { "meta", KSTAT_DATA_UINT64 },
541 { "pd", KSTAT_DATA_UINT64 },
542 { "pm", KSTAT_DATA_UINT64 },
543 { "c", KSTAT_DATA_UINT64 },
544 { "c_min", KSTAT_DATA_UINT64 },
545 { "c_max", KSTAT_DATA_UINT64 },
546 { "size", KSTAT_DATA_UINT64 },
547 { "compressed_size", KSTAT_DATA_UINT64 },
548 { "uncompressed_size", KSTAT_DATA_UINT64 },
549 { "overhead_size", KSTAT_DATA_UINT64 },
550 { "hdr_size", KSTAT_DATA_UINT64 },
551 { "data_size", KSTAT_DATA_UINT64 },
552 { "metadata_size", KSTAT_DATA_UINT64 },
553 { "dbuf_size", KSTAT_DATA_UINT64 },
554 { "dnode_size", KSTAT_DATA_UINT64 },
555 { "bonus_size", KSTAT_DATA_UINT64 },
556 #if defined(COMPAT_FREEBSD11)
557 { "other_size", KSTAT_DATA_UINT64 },
558 #endif
559 { "anon_size", KSTAT_DATA_UINT64 },
560 { "anon_data", KSTAT_DATA_UINT64 },
561 { "anon_metadata", KSTAT_DATA_UINT64 },
562 { "anon_evictable_data", KSTAT_DATA_UINT64 },
563 { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
564 { "mru_size", KSTAT_DATA_UINT64 },
565 { "mru_data", KSTAT_DATA_UINT64 },
566 { "mru_metadata", KSTAT_DATA_UINT64 },
567 { "mru_evictable_data", KSTAT_DATA_UINT64 },
568 { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
569 { "mru_ghost_size", KSTAT_DATA_UINT64 },
570 { "mru_ghost_data", KSTAT_DATA_UINT64 },
571 { "mru_ghost_metadata", KSTAT_DATA_UINT64 },
572 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
573 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
574 { "mfu_size", KSTAT_DATA_UINT64 },
575 { "mfu_data", KSTAT_DATA_UINT64 },
576 { "mfu_metadata", KSTAT_DATA_UINT64 },
577 { "mfu_evictable_data", KSTAT_DATA_UINT64 },
578 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
579 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
580 { "mfu_ghost_data", KSTAT_DATA_UINT64 },
581 { "mfu_ghost_metadata", KSTAT_DATA_UINT64 },
582 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
583 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
584 { "uncached_size", KSTAT_DATA_UINT64 },
585 { "uncached_data", KSTAT_DATA_UINT64 },
586 { "uncached_metadata", KSTAT_DATA_UINT64 },
587 { "uncached_evictable_data", KSTAT_DATA_UINT64 },
588 { "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
589 { "l2_ndev", KSTAT_DATA_UINT64 },
590 { "l2_hits", KSTAT_DATA_UINT64 },
591 { "l2_misses", KSTAT_DATA_UINT64 },
592 { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
593 { "l2_mru_asize", KSTAT_DATA_UINT64 },
594 { "l2_mfu_asize", KSTAT_DATA_UINT64 },
595 { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
596 { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
597 { "l2_feeds", KSTAT_DATA_UINT64 },
598 { "l2_rw_clash", KSTAT_DATA_UINT64 },
599 { "l2_read_bytes", KSTAT_DATA_UINT64 },
600 { "l2_write_bytes", KSTAT_DATA_UINT64 },
601 { "l2_writes_sent", KSTAT_DATA_UINT64 },
602 { "l2_writes_done", KSTAT_DATA_UINT64 },
603 { "l2_writes_error", KSTAT_DATA_UINT64 },
604 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
605 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
606 { "l2_evict_reading", KSTAT_DATA_UINT64 },
607 { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
608 { "l2_free_on_write", KSTAT_DATA_UINT64 },
609 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
610 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
611 { "l2_io_error", KSTAT_DATA_UINT64 },
612 { "l2_size", KSTAT_DATA_UINT64 },
613 { "l2_asize", KSTAT_DATA_UINT64 },
614 { "l2_hdr_size", KSTAT_DATA_UINT64 },
615 { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
616 { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
617 { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
618 { "l2_log_blk_count", KSTAT_DATA_UINT64 },
619 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
620 { "l2_rebuild_success", KSTAT_DATA_UINT64 },
621 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
622 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
623 { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
624 { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
625 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
626 { "l2_rebuild_size", KSTAT_DATA_UINT64 },
627 { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
628 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
629 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
630 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
631 { "memory_throttle_count", KSTAT_DATA_UINT64 },
632 { "memory_direct_count", KSTAT_DATA_UINT64 },
633 { "memory_indirect_count", KSTAT_DATA_UINT64 },
634 { "memory_all_bytes", KSTAT_DATA_UINT64 },
635 { "memory_free_bytes", KSTAT_DATA_UINT64 },
636 { "memory_available_bytes", KSTAT_DATA_INT64 },
637 { "arc_no_grow", KSTAT_DATA_UINT64 },
638 { "arc_tempreserve", KSTAT_DATA_UINT64 },
639 { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
640 { "arc_prune", KSTAT_DATA_UINT64 },
641 { "arc_meta_used", KSTAT_DATA_UINT64 },
642 { "arc_dnode_limit", KSTAT_DATA_UINT64 },
643 { "async_upgrade_sync", KSTAT_DATA_UINT64 },
644 { "predictive_prefetch", KSTAT_DATA_UINT64 },
645 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
646 { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
647 { "prescient_prefetch", KSTAT_DATA_UINT64 },
648 { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
649 { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
650 { "arc_need_free", KSTAT_DATA_UINT64 },
651 { "arc_sys_free", KSTAT_DATA_UINT64 },
652 { "arc_raw_size", KSTAT_DATA_UINT64 },
653 { "cached_only_in_progress", KSTAT_DATA_UINT64 },
654 { "abd_chunk_waste_size", KSTAT_DATA_UINT64 },
655 };
656
657 arc_sums_t arc_sums;
658
/*
 * Raise the named kstat to (val) if (val) is larger than its current value.
 * The update is done with atomic_cas_64() and is retried for as long as the
 * compare-and-swap loses to a concurrent writer and (val) is still larger
 * than the value that writer left behind.
 *
 * (val) is evaluated exactly once, so an argument with side effects is
 * safe.  The body is wrapped in do { } while (0) so the macro expands to a
 * single statement and can be used as the unbraced body of an if/else.
 */
#define	ARCSTAT_MAX(stat, val) \
do { \
	uint64_t arcstat_max_new_ = (val); \
	uint64_t arcstat_max_cur_; \
	while (arcstat_max_new_ > \
	    (arcstat_max_cur_ = arc_stats.stat.value.ui64) && \
	    (arcstat_max_cur_ != atomic_cas_64(&arc_stats.stat.value.ui64, \
	    arcstat_max_cur_, arcstat_max_new_))) \
		continue; \
} while (0)
665
/*
 * Bump exactly one of four kstats, selected by two independent conditions.
 * The kstat name is pasted together as
 *
 *	arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
 *
 * where cond1 selects stat1 (true) or notstat1 (false), and cond2 selects
 * stat2 (true) or notstat2 (false).  This lets ARC hits/misses be broken
 * down into four subtypes each (so eight statistics total).
 *
 * The expansion is wrapped in do { } while (0) so that it is a single
 * statement: its internal if/else can never pair up with an else at the
 * call site, and the macro can be the unbraced body of an if.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
do { \
	if (cond1) { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		} \
	} else { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		} \
	} \
} while (0)
685
/*
 * This macro allows us to use kstats as floating averages.  On each update
 * the kstat gives up 1/ARCSTAT_F_AVG_FACTOR of its current value and gains
 * 1/ARCSTAT_F_AVG_FACTOR of the new sample, which limits how much a single
 * sample can move the overall average.  The macro assumes that integer
 * loads and stores are atomic, but it is not safe for multiple writers
 * updating the kstat in parallel (only the last writer's update will
 * remain).
 */
#define	ARCSTAT_F_AVG_FACTOR	3
#define	ARCSTAT_F_AVG(stat, value) \
do { \
	uint64_t favg_ = ARCSTAT(stat); \
	favg_ -= favg_ / ARCSTAT_F_AVG_FACTOR; \
	favg_ += (value) / ARCSTAT_F_AVG_FACTOR; \
	ARCSTAT(stat) = favg_; \
} while (0)
702
703 static kstat_t *arc_ksp;
704
705 /*
706 * There are several ARC variables that are critical to export as kstats --
707 * but we don't want to have to grovel around in the kstat whenever we wish to
708 * manipulate them. For these variables, we therefore define them to be in
709 * terms of the statistic variable. This assures that we are not introducing
710 * the possibility of inconsistency by having shadow copies of the variables,
711 * while still allowing the code to be readable.
712 */
713 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
714 #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
715 #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
716 #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
717
718 hrtime_t arc_growtime;
719 list_t arc_prune_list;
720 kmutex_t arc_prune_mtx;
721 taskq_t *arc_prune_taskq;
722
/*
 * True when 'state' equals one of arc_mru_ghost, arc_mfu_ghost or
 * arc_l2c_only.  Note that 'state' may be evaluated up to three times.
 */
#define	GHOST_STATE(state) \
	((state) == arc_mru_ghost || \
	(state) == arc_mfu_ghost || \
	(state) == arc_l2c_only)
726
/*
 * Header flag tests.  HDR_TEST_FLAG() yields the masked bit(s) of
 * hdr->b_flags: non-zero when the flag is set, but not normalized to 0/1.
 */
#define	HDR_TEST_FLAG(hdr, flag)	((hdr)->b_flags & (flag))

#define	HDR_IN_HASH_TABLE(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_PREFETCH)
#define	HDR_PRESCIENT_PREFETCH(hdr) \
	HDR_TEST_FLAG(hdr, ARC_FLAG_PRESCIENT_PREFETCH)
#define	HDR_COMPRESSION_ENABLED(hdr) \
	HDR_TEST_FLAG(hdr, ARC_FLAG_COMPRESSED_ARC)

#define	HDR_L2CACHE(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_L2CACHE)
#define	HDR_UNCACHED(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_UNCACHED)
/* Both ARC_FLAG_IO_IN_PROGRESS and ARC_FLAG_HAS_L2HDR are set. */
#define	HDR_L2_READING(hdr) \
	(HDR_IO_IN_PROGRESS(hdr) && HDR_HAS_L2HDR(hdr))
#define	HDR_L2_WRITING(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_L2_WRITE_HEAD)
#define	HDR_PROTECTED(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_PROTECTED)
#define	HDR_NOAUTH(hdr)		HDR_TEST_FLAG(hdr, ARC_FLAG_NOAUTH)
#define	HDR_SHARED_DATA(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_SHARED_DATA)

/* A header is either metadata or data; data is simply "not metadata". */
#define	HDR_ISTYPE_METADATA(hdr) \
	HDR_TEST_FLAG(hdr, ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	HDR_TEST_FLAG(hdr, ARC_FLAG_HAS_L2HDR)
/*
 * b_crypt_hdr is only consulted after the flag checks have passed; the
 * order of the && operands is significant.
 */
#define	HDR_HAS_RABD(hdr) \
	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
	(hdr)->b_crypt_hdr.b_rabd != NULL)
/* A protected header is encrypted or authenticated, depending on b_ot. */
#define	HDR_ENCRYPTED(hdr) \
	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
#define	HDR_AUTHENTICATED(hdr) \
	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
761
/*
 * For storing compression mode in b_flags.  The value lives in a
 * SPA_COMPRESSBITS-wide bit field whose offset is derived from
 * ARC_FLAG_COMPRESS_0 via highbit64().
 */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

/* Read the compression bit field back as an enum zio_compress. */
#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
/*
 * Store `cmp' into the compression bit field.  The expansion deliberately
 * carries no trailing semicolon so that "HDR_SET_COMPRESS(hdr, cmp);"
 * is a single statement and remains valid in an unbraced if/else.
 */
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))
769
/*
 * Predicates over an arc_buf_t.  ARC_BUF_LAST() tests for a NULL b_next;
 * the others mask b_flags with one ARC_BUF_FLAG_* bit and therefore yield
 * zero or the flag value, not a normalized 0/1.
 */
#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
#define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
#define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
774
/*
 * Other sizes
 *
 * HDR_FULL_SIZE is the size of a whole arc_buf_hdr_t; HDR_L2ONLY_SIZE is
 * the size of the part of the structure that precedes the b_l1hdr member.
 * Both are cast to int64_t.
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
781
/*
 * Hash table routines
 */

/* Number of hash locks; BUF_HASH_LOCK() selects one with (BUF_LOCKS-1). */
#define	BUF_LOCKS 2048
typedef struct buf_hash_table {
	/* Table size minus one (set in buf_init()); ANDed with the hash. */
	uint64_t ht_mask;
	/* Bucket heads; entries in a bucket are chained via b_hash_next. */
	arc_buf_hdr_t **ht_table;
	/* Lock array shared by all buckets; see BUF_HASH_LOCK(). */
	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
} buf_hash_table_t;

/* The single, file-private hash table instance. */
static buf_hash_table_t buf_hash_table;
794
/* Bucket index for a buffer identity: the hash masked to the table size. */
#define	BUF_HASH_INDEX(spa, dva, birth)	\
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
/*
 * Address of the lock covering bucket `idx'.  The argument is parenthesized
 * so that an expression argument (e.g. "a | b") is masked as a whole.
 */
#define	BUF_HASH_LOCK(idx)	\
	(&buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
/*
 * Address of the lock covering the bucket `hdr' hashes to.  `hdr' is
 * parenthesized so any pointer-valued expression may be passed.
 */
#define	HDR_LOCK(hdr)	\
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva,	\
	(hdr)->b_birth)))
800
/* 256-entry table filled in by buf_init() using ZFS_CRC64_POLY. */
uint64_t zfs_crc64_table[256];
802
/*
 * Asynchronous ARC flush
 *
 * We track these in a list for arc_async_flush_guid_inuse().
 * Used for both L1 and L2 async teardown.
 */
static list_t arc_async_flush_list;
/* NOTE(review): presumably protects arc_async_flush_list - confirm. */
static kmutex_t arc_async_flush_lock;

/* One tracked asynchronous flush. */
typedef struct arc_async_flush {
	uint64_t	af_spa_guid;	/* guid of the spa being flushed */
	taskq_ent_t	af_tqent;	/* taskq entry for the flush work */
	uint_t		af_cache_level;	/* 1 or 2 to differentiate node */
	list_node_t	af_node;	/* list linkage */
} arc_async_flush_t;
818
819
/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(64 * 1024 * 1024)	/* initial write max */
#define	L2ARC_BURST_SIZE_MAX	(64 * 1024 * 1024)	/* max burst size */
#define	L2ARC_HEADROOM		8			/* num of writes */

/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple
 * (the default of 200 therefore means 2x).
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Min L2ARC capacity to enable persistent markers, adaptive intervals, and
 * DWPD rate limiting.  L2ARC must be at least twice arc_c_max to benefit from
 * inclusive caching - smaller L2ARC would either cyclically overwrite itself
 * (if L2ARC < ARC) or merely duplicate ARC contents (if L2ARC = ARC).
 * With L2ARC >= 2*ARC, there's room for ARC duplication plus additional
 * cached data.
 */
#define	L2ARC_PERSIST_THRESHOLD	(arc_c_max * 2)

/*
 * L2ARC Performance Tunables
 *
 * Each is initialized from the matching L2ARC_* default above where one
 * exists.  All are file-private except l2arc_dwpd_limit, which has external
 * linkage.
 */
static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
uint64_t l2arc_dwpd_limit = 100;			/* 100 = 1.0 DWPD */
static uint64_t l2arc_dwpd_bump = 0;			/* DWPD reset trigger */
static uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
static int l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
static int l2arc_feed_again = B_TRUE;		/* turbo warmup */
static int l2arc_norw = B_FALSE;		/* no reads during writes */
static uint_t l2arc_meta_percent = 33;		/* limit on headers size */
858
/*
 * L2ARC Internals
 *
 * Each list exists both as storage (L2ARC_*) and as a pointer (l2arc_*).
 * NOTE(review): presumably the pointers are aimed at the storage during
 * initialization elsewhere in the file - confirm.
 */
static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */
869
/*
 * State saved for one L2ARC read: the header being read plus the original
 * request's block pointer, bookmark and flags.
 * NOTE(review): presumably consumed by l2arc_read_done() - confirm.
 */
typedef struct l2arc_read_callback {
	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	abd_t			*l2rcb_abd;		/* temporary buffer */
} l2arc_read_callback_t;
877
/*
 * One ABD queued to be freed later, together with the L2ARC device it
 * belongs to.
 */
typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	abd_t		*l2df_abd;	/* the ABD awaiting release */
	l2arc_dev_t	*l2df_dev;	/* L2ARC device that owns this ABD */
	list_node_t	l2df_list_node;	/* list linkage */
} l2arc_data_free_t;
884
/*
 * Flags describing how a buffer should be filled.  The values are distinct
 * single bits, so they can be ORed together.
 */
typedef enum arc_fill_flags {
	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
} arc_fill_flags_t;
892
/*
 * How far the ARC is over its target size.  The enumerators take the default
 * ascending values 0, 1, 2, so a larger value means a worse overflow.
 */
typedef enum arc_ovf_level {
	ARC_OVF_NONE,			/* ARC within target size. */
	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
} arc_ovf_level_t;
898
/*
 * Lock and condition variable for the L2ARC rebuild thread.
 * NOTE(review): presumably used to wait for l2arc_dev_rebuild_thread() to
 * finish or be cancelled - confirm against the rebuild code.
 */
static kmutex_t l2arc_rebuild_thr_lock;
static kcondvar_t l2arc_rebuild_thr_cv;
901
/*
 * Bit flags for header ABD allocation; arc_hdr_alloc_abd() takes an int
 * argument, presumably a combination of these (confirm at its definition).
 * The value 0x2 is not assigned here.
 */
enum arc_hdr_alloc_flags {
	ARC_HDR_ALLOC_RDATA = 0x1,
	ARC_HDR_USE_RESERVE = 0x4,
	ARC_HDR_ALLOC_LINEAR = 0x8,
};
907
908
/*
 * Forward declarations for file-private routines defined later in this
 * file, so that earlier code can call them.
 */

/* Data buffer / ABD allocation and release. */
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
    const void *tag);
/* Header lifecycle and state handling. */
static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
static void arc_hdr_destroy(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
static void arc_buf_watch(arc_buf_t *);
static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);

/* Buffer type and header flag helpers. */
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);

/* L2ARC helpers. */
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
static void l2arc_do_free_on_write(l2arc_dev_t *dev);
static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
    boolean_t state_only);
static uint64_t l2arc_get_write_rate(l2arc_dev_t *dev);

static void arc_prune_async(uint64_t adjust);
936
/*
 * Convenience wrappers around l2arc_hdr_arcstats_update().  The second
 * argument (incr) is B_TRUE for the increment variants and B_FALSE for the
 * decrement variants; the third (state_only) is B_TRUE only for the
 * *_state variants.
 */
#define	l2arc_hdr_arcstats_increment(hdr) \
	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
#define	l2arc_hdr_arcstats_decrement(hdr) \
	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
#define	l2arc_hdr_arcstats_increment_state(hdr) \
	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
#define	l2arc_hdr_arcstats_decrement_state(hdr) \
	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
945
/*
 * l2arc_exclude_special : A zfs module parameter that controls whether buffers
 * 		present on special vdevs are eligible for caching in L2ARC. If
 * 		set to 1, exclude dbufs on special vdevs from being cached to
 * 		L2ARC. The default is 0.
 */
int l2arc_exclude_special = 0;
953
/*
 * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
 * 		metadata and data are cached from ARC into L2ARC.
 * 		The default is 0.
 */
static int l2arc_mfuonly = 0;

/*
 * Depth cap as percentage of state size. Each pass resets its markers
 * to tail after scanning this fraction of the state. Keeps markers
 * focused on the tail zone where L2ARC adds the most value.
 * The default is 25 (percent).
 */
static uint64_t l2arc_ext_headroom_pct = 25;

/*
 * Metadata monopolization limit. When metadata fills the write budget
 * for this many consecutive cycles while data gets nothing, skip metadata
 * for one cycle to let data run, then reset the counter.
 * With N=2 (the default), the steady-state pattern under sustained
 * monopolization is 2 metadata cycles followed by 1 data cycle
 * (67%/33% split).
 */
static uint64_t l2arc_meta_cycles = 2;
975
/*
 * L2ARC TRIM
 * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
 * 		the current write size (l2arc_write_max) we should TRIM if we
 * 		have filled the device. It is defined as a percentage of the
 * 		write size. If set to 100 we trim twice the space required to
 * 		accommodate upcoming writes. A minimum of 64MB will be trimmed.
 * 		It also enables TRIM of the whole L2ARC device upon creation or
 * 		addition to an existing pool or if the header of the device is
 * 		invalid upon importing a pool or onlining a cache device. The
 * 		default is 0, which disables TRIM on L2ARC altogether as it can
 * 		put significant stress on the underlying storage devices. This
 * 		will vary depending on how well the specific device handles
 * 		these commands.
 */
static uint64_t l2arc_trim_ahead = 0;
992
/*
 * Performance tuning of L2ARC persistence:
 *
 * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
 * 		an L2ARC device (either at pool import or later) will attempt
 * 		to rebuild L2ARC buffer contents. Enabled (B_TRUE) by default.
 * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
 * 		whether log blocks are written to the L2ARC device. If the L2ARC
 * 		device is less than 1GB, the amount of data l2arc_evict()
 * 		evicts is significant compared to the amount of restored L2ARC
 * 		data. In this case do not write log blocks in L2ARC in order
 * 		not to waste space. The default threshold is 1GB
 * 		(1024 * 1024 * 1024 bytes).
 */
static int l2arc_rebuild_enabled = B_TRUE;
static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
1008
/*
 * L2ARC persistence rebuild control routines.
 * l2arc_dev_rebuild_thread() is declared noreturn.
 */
void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
static int l2arc_rebuild(l2arc_dev_t *dev);

/* L2ARC persistence read I/O routines. */
static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
static int l2arc_log_blk_read(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
    zio_t *this_io, zio_t **next_io);
static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
    const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
static void l2arc_log_blk_fetch_abort(zio_t *zio);

/* L2ARC persistence block restoration routines. */
static void l2arc_log_blk_restore(l2arc_dev_t *dev,
    const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
    l2arc_dev_t *dev);

/* L2ARC persistence write I/O routines. */
static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
    l2arc_write_callback_t *cb);

/*
 * L2ARC persistence auxiliary routines.
 * l2arc_log_blkptr_valid() and l2arc_range_check_overlap() are not static
 * and so have external linkage.
 */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *lbp);
static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
    const arc_buf_hdr_t *ab);
boolean_t l2arc_range_check_overlap(uint64_t bottom,
    uint64_t top, uint64_t check);
static void l2arc_blk_fetch_done(zio_t *zio);
static inline uint64_t
    l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
1044
1045 /*
1046 * We use Cityhash for this. It's fast, and has good hash properties without
1047 * requiring any large static buffers.
1048 */
1049 static uint64_t
buf_hash(uint64_t spa,const dva_t * dva,uint64_t birth)1050 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1051 {
1052 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
1053 }
1054
/* A header is "empty" when both words of its DVA are zero. */
#define	HDR_EMPTY(hdr)						\
	((hdr)->b_dva.dva_word[0] == 0 &&			\
	(hdr)->b_dva.dva_word[1] == 0)

/* Either the header is empty or its hash lock is held. */
#define	HDR_EMPTY_OR_LOCKED(hdr)				\
	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))

/*
 * Non-zero when `hdr' carries the identity (spa, dva, birth).  The whole
 * expansion and every argument are parenthesized so the macro composes
 * safely with surrounding operators such as `!' or `||'.
 */
#define	HDR_EQUAL(spa, dva, birth, hdr)				\
	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
1066
1067 static void
buf_discard_identity(arc_buf_hdr_t * hdr)1068 buf_discard_identity(arc_buf_hdr_t *hdr)
1069 {
1070 hdr->b_dva.dva_word[0] = 0;
1071 hdr->b_dva.dva_word[1] = 0;
1072 hdr->b_birth = 0;
1073 }
1074
1075 static arc_buf_hdr_t *
buf_hash_find(uint64_t spa,const blkptr_t * bp,kmutex_t ** lockp)1076 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1077 {
1078 const dva_t *dva = BP_IDENTITY(bp);
1079 uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp);
1080 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1081 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1082 arc_buf_hdr_t *hdr;
1083
1084 mutex_enter(hash_lock);
1085 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1086 hdr = hdr->b_hash_next) {
1087 if (HDR_EQUAL(spa, dva, birth, hdr)) {
1088 *lockp = hash_lock;
1089 return (hdr);
1090 }
1091 }
1092 mutex_exit(hash_lock);
1093 *lockp = NULL;
1094 return (NULL);
1095 }
1096
1097 /*
1098 * Insert an entry into the hash table. If there is already an element
1099 * equal to elem in the hash table, then the already existing element
1100 * will be returned and the new element will not be inserted.
1101 * Otherwise returns NULL.
1102 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1103 */
1104 static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t * hdr,kmutex_t ** lockp)1105 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1106 {
1107 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1108 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1109 arc_buf_hdr_t *fhdr;
1110 uint32_t i;
1111
1112 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1113 ASSERT(hdr->b_birth != 0);
1114 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1115
1116 if (lockp != NULL) {
1117 *lockp = hash_lock;
1118 mutex_enter(hash_lock);
1119 } else {
1120 ASSERT(MUTEX_HELD(hash_lock));
1121 }
1122
1123 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1124 fhdr = fhdr->b_hash_next, i++) {
1125 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1126 return (fhdr);
1127 }
1128
1129 hdr->b_hash_next = buf_hash_table.ht_table[idx];
1130 buf_hash_table.ht_table[idx] = hdr;
1131 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1132
1133 /* collect some hash table performance data */
1134 if (i > 0) {
1135 ARCSTAT_BUMP(arcstat_hash_collisions);
1136 if (i == 1)
1137 ARCSTAT_BUMP(arcstat_hash_chains);
1138 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1139 }
1140 ARCSTAT_BUMP(arcstat_hash_elements);
1141
1142 return (NULL);
1143 }
1144
1145 static void
buf_hash_remove(arc_buf_hdr_t * hdr)1146 buf_hash_remove(arc_buf_hdr_t *hdr)
1147 {
1148 arc_buf_hdr_t *fhdr, **hdrp;
1149 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1150
1151 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1152 ASSERT(HDR_IN_HASH_TABLE(hdr));
1153
1154 hdrp = &buf_hash_table.ht_table[idx];
1155 while ((fhdr = *hdrp) != hdr) {
1156 ASSERT3P(fhdr, !=, NULL);
1157 hdrp = &fhdr->b_hash_next;
1158 }
1159 *hdrp = hdr->b_hash_next;
1160 hdr->b_hash_next = NULL;
1161 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1162
1163 /* collect some hash table performance data */
1164 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1165 if (buf_hash_table.ht_table[idx] &&
1166 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1167 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1168 }
1169
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * The three caches below are created in buf_init() and destroyed in
 * buf_fini().
 */

/* Objects of HDR_FULL_SIZE bytes ("arc_buf_hdr_t_full"). */
static kmem_cache_t *hdr_full_cache;
/* Objects of HDR_L2ONLY_SIZE bytes ("arc_buf_hdr_t_l2only"). */
static kmem_cache_t *hdr_l2only_cache;
/* Objects of sizeof (arc_buf_t) bytes ("arc_buf_t"). */
static kmem_cache_t *buf_cache;
1177
1178 static void
buf_fini(void)1179 buf_fini(void)
1180 {
1181 #if defined(_KERNEL)
1182 /*
1183 * Large allocations which do not require contiguous pages
1184 * should be using vmem_free() in the linux kernel.
1185 */
1186 vmem_free(buf_hash_table.ht_table,
1187 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1188 #else
1189 kmem_free(buf_hash_table.ht_table,
1190 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1191 #endif
1192 for (int i = 0; i < BUF_LOCKS; i++)
1193 mutex_destroy(BUF_HASH_LOCK(i));
1194 kmem_cache_destroy(hdr_full_cache);
1195 kmem_cache_destroy(hdr_l2only_cache);
1196 kmem_cache_destroy(buf_cache);
1197 }
1198
1199 /*
1200 * Constructor callback - called when the cache is empty
1201 * and a new buf is requested.
1202 */
1203 static int
hdr_full_cons(void * vbuf,void * unused,int kmflag)1204 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1205 {
1206 (void) unused, (void) kmflag;
1207 arc_buf_hdr_t *hdr = vbuf;
1208
1209 memset(hdr, 0, HDR_FULL_SIZE);
1210 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1211 zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1212 #ifdef ZFS_DEBUG
1213 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1214 #endif
1215 multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1216 list_link_init(&hdr->b_l2hdr.b_l2node);
1217 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1218
1219 return (0);
1220 }
1221
1222 static int
hdr_l2only_cons(void * vbuf,void * unused,int kmflag)1223 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1224 {
1225 (void) unused, (void) kmflag;
1226 arc_buf_hdr_t *hdr = vbuf;
1227
1228 memset(hdr, 0, HDR_L2ONLY_SIZE);
1229 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1230
1231 return (0);
1232 }
1233
1234 static int
buf_cons(void * vbuf,void * unused,int kmflag)1235 buf_cons(void *vbuf, void *unused, int kmflag)
1236 {
1237 (void) unused, (void) kmflag;
1238 arc_buf_t *buf = vbuf;
1239
1240 memset(buf, 0, sizeof (arc_buf_t));
1241 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1242
1243 return (0);
1244 }
1245
1246 /*
1247 * Destructor callback - called when a cached buf is
1248 * no longer required.
1249 */
1250 static void
hdr_full_dest(void * vbuf,void * unused)1251 hdr_full_dest(void *vbuf, void *unused)
1252 {
1253 (void) unused;
1254 arc_buf_hdr_t *hdr = vbuf;
1255
1256 ASSERT(HDR_EMPTY(hdr));
1257 zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1258 #ifdef ZFS_DEBUG
1259 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1260 #endif
1261 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1262 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1263 }
1264
1265 static void
hdr_l2only_dest(void * vbuf,void * unused)1266 hdr_l2only_dest(void *vbuf, void *unused)
1267 {
1268 (void) unused;
1269 arc_buf_hdr_t *hdr = vbuf;
1270
1271 ASSERT(HDR_EMPTY(hdr));
1272 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1273 }
1274
1275 static void
buf_dest(void * vbuf,void * unused)1276 buf_dest(void *vbuf, void *unused)
1277 {
1278 (void) unused;
1279 (void) vbuf;
1280
1281 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1282 }
1283
1284 static void
buf_init(void)1285 buf_init(void)
1286 {
1287 uint64_t *ct = NULL;
1288 uint64_t hsize = 1ULL << 12;
1289 int i, j;
1290
1291 /*
1292 * The hash table is big enough to fill all of physical memory
1293 * with an average block size of zfs_arc_average_blocksize (default 8K).
1294 * By default, the table will take up
1295 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1296 */
1297 while (hsize * zfs_arc_average_blocksize < arc_all_memory())
1298 hsize <<= 1;
1299 retry:
1300 buf_hash_table.ht_mask = hsize - 1;
1301 #if defined(_KERNEL)
1302 /*
1303 * Large allocations which do not require contiguous pages
1304 * should be using vmem_alloc() in the linux kernel
1305 */
1306 buf_hash_table.ht_table =
1307 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1308 #else
1309 buf_hash_table.ht_table =
1310 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1311 #endif
1312 if (buf_hash_table.ht_table == NULL) {
1313 ASSERT(hsize > (1ULL << 8));
1314 hsize >>= 1;
1315 goto retry;
1316 }
1317
1318 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1319 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
1320 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1321 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
1322 NULL, NULL, 0);
1323 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1324 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1325
1326 for (i = 0; i < 256; i++)
1327 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1328 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1329
1330 for (i = 0; i < BUF_LOCKS; i++)
1331 mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
1332 }
1333
/* hz / 16 clock ticks, i.e. one sixteenth of a second. */
#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1335
1336 /*
1337 * This is the size that the buf occupies in memory. If the buf is compressed,
1338 * it will correspond to the compressed size. You should use this method of
1339 * getting the buf size unless you explicitly need the logical size.
1340 */
1341 uint64_t
arc_buf_size(arc_buf_t * buf)1342 arc_buf_size(arc_buf_t *buf)
1343 {
1344 return (ARC_BUF_COMPRESSED(buf) ?
1345 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1346 }
1347
1348 uint64_t
arc_buf_lsize(arc_buf_t * buf)1349 arc_buf_lsize(arc_buf_t *buf)
1350 {
1351 return (HDR_GET_LSIZE(buf->b_hdr));
1352 }
1353
1354 /*
1355 * This function will return B_TRUE if the buffer is encrypted in memory.
1356 * This buffer can be decrypted by calling arc_untransform().
1357 */
1358 boolean_t
arc_is_encrypted(arc_buf_t * buf)1359 arc_is_encrypted(arc_buf_t *buf)
1360 {
1361 return (ARC_BUF_ENCRYPTED(buf) != 0);
1362 }
1363
1364 /*
1365 * Returns B_TRUE if the buffer represents data that has not had its MAC
1366 * verified yet.
1367 */
1368 boolean_t
arc_is_unauthenticated(arc_buf_t * buf)1369 arc_is_unauthenticated(arc_buf_t *buf)
1370 {
1371 return (HDR_NOAUTH(buf->b_hdr) != 0);
1372 }
1373
1374 void
arc_get_raw_params(arc_buf_t * buf,boolean_t * byteorder,uint8_t * salt,uint8_t * iv,uint8_t * mac)1375 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1376 uint8_t *iv, uint8_t *mac)
1377 {
1378 arc_buf_hdr_t *hdr = buf->b_hdr;
1379
1380 ASSERT(HDR_PROTECTED(hdr));
1381
1382 memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
1383 memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
1384 memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
1385 *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1386 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1387 }
1388
1389 /*
1390 * Indicates how this buffer is compressed in memory. If it is not compressed
1391 * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1392 * arc_untransform() as long as it is also unencrypted.
1393 */
1394 enum zio_compress
arc_get_compression(arc_buf_t * buf)1395 arc_get_compression(arc_buf_t *buf)
1396 {
1397 return (ARC_BUF_COMPRESSED(buf) ?
1398 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1399 }
1400
1401 /*
1402 * Return the compression algorithm used to store this data in the ARC. If ARC
1403 * compression is enabled or this is an encrypted block, this will be the same
1404 * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1405 */
1406 static inline enum zio_compress
arc_hdr_get_compress(arc_buf_hdr_t * hdr)1407 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1408 {
1409 return (HDR_COMPRESSION_ENABLED(hdr) ?
1410 HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1411 }
1412
1413 uint8_t
arc_get_complevel(arc_buf_t * buf)1414 arc_get_complevel(arc_buf_t *buf)
1415 {
1416 return (buf->b_hdr->b_complevel);
1417 }
1418
1419 __maybe_unused
1420 static inline boolean_t
arc_buf_is_shared(arc_buf_t * buf)1421 arc_buf_is_shared(arc_buf_t *buf)
1422 {
1423 boolean_t shared = (buf->b_data != NULL &&
1424 buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1425 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1426 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1427 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1428 EQUIV(shared, ARC_BUF_SHARED(buf));
1429 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1430
1431 /*
1432 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1433 * already being shared" requirement prevents us from doing that.
1434 */
1435
1436 return (shared);
1437 }
1438
1439 /*
1440 * Free the checksum associated with this header. If there is no checksum, this
1441 * is a no-op.
1442 */
1443 static inline void
arc_cksum_free(arc_buf_hdr_t * hdr)1444 arc_cksum_free(arc_buf_hdr_t *hdr)
1445 {
1446 #ifdef ZFS_DEBUG
1447 ASSERT(HDR_HAS_L1HDR(hdr));
1448
1449 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1450 if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1451 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1452 hdr->b_l1hdr.b_freeze_cksum = NULL;
1453 }
1454 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1455 #endif
1456 }
1457
1458 /*
1459 * Return true iff at least one of the bufs on hdr is not compressed.
1460 * Encrypted buffers count as compressed.
1461 */
1462 static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t * hdr)1463 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1464 {
1465 ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1466
1467 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1468 if (!ARC_BUF_COMPRESSED(b)) {
1469 return (B_TRUE);
1470 }
1471 }
1472 return (B_FALSE);
1473 }
1474
1475
1476 /*
1477 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1478 * matches the checksum that is stored in the hdr. If there is no checksum,
1479 * or if the buf is compressed, this is a no-op.
1480 */
1481 static void
arc_cksum_verify(arc_buf_t * buf)1482 arc_cksum_verify(arc_buf_t *buf)
1483 {
1484 #ifdef ZFS_DEBUG
1485 arc_buf_hdr_t *hdr = buf->b_hdr;
1486 zio_cksum_t zc;
1487
1488 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1489 return;
1490
1491 if (ARC_BUF_COMPRESSED(buf))
1492 return;
1493
1494 ASSERT(HDR_HAS_L1HDR(hdr));
1495
1496 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1497
1498 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1499 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1500 return;
1501 }
1502
1503 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1504 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1505 panic("buffer modified while frozen!");
1506 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1507 #endif
1508 }
1509
1510 /*
1511 * This function makes the assumption that data stored in the L2ARC
1512 * will be transformed exactly as it is in the main pool. Because of
1513 * this we can verify the checksum against the reading process's bp.
1514 */
1515 static boolean_t
arc_cksum_is_equal(arc_buf_hdr_t * hdr,zio_t * zio)1516 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1517 {
1518 ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1519 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1520
1521 /*
1522 * Block pointers always store the checksum for the logical data.
1523 * If the block pointer has the gang bit set, then the checksum
1524 * it represents is for the reconstituted data and not for an
1525 * individual gang member. The zio pipeline, however, must be able to
1526 * determine the checksum of each of the gang constituents so it
1527 * treats the checksum comparison differently than what we need
1528 * for l2arc blocks. This prevents us from using the
1529 * zio_checksum_error() interface directly. Instead we must call the
1530 * zio_checksum_error_impl() so that we can ensure the checksum is
1531 * generated using the correct checksum algorithm and accounts for the
1532 * logical I/O size and not just a gang fragment.
1533 */
1534 return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1535 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1536 zio->io_offset, NULL) == 0);
1537 }
1538
1539 /*
1540 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1541 * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1542 * isn't modified later on. If buf is compressed or there is already a checksum
1543 * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1544 */
1545 static void
arc_cksum_compute(arc_buf_t * buf)1546 arc_cksum_compute(arc_buf_t *buf)
1547 {
1548 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1549 return;
1550
1551 #ifdef ZFS_DEBUG
1552 arc_buf_hdr_t *hdr = buf->b_hdr;
1553 ASSERT(HDR_HAS_L1HDR(hdr));
1554 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1555 if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1556 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1557 return;
1558 }
1559
1560 ASSERT(!ARC_BUF_ENCRYPTED(buf));
1561 ASSERT(!ARC_BUF_COMPRESSED(buf));
1562 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1563 KM_SLEEP);
1564 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1565 hdr->b_l1hdr.b_freeze_cksum);
1566 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1567 #endif
1568 arc_buf_watch(buf);
1569 }
1570
1571 #ifndef _KERNEL
1572 void
arc_buf_sigsegv(int sig,siginfo_t * si,void * unused)1573 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1574 {
1575 (void) sig, (void) unused;
1576 panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
1577 }
1578 #endif
1579
1580 static void
arc_buf_unwatch(arc_buf_t * buf)1581 arc_buf_unwatch(arc_buf_t *buf)
1582 {
1583 #ifndef _KERNEL
1584 if (arc_watch) {
1585 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
1586 PROT_READ | PROT_WRITE));
1587 }
1588 #else
1589 (void) buf;
1590 #endif
1591 }
1592
1593 static void
arc_buf_watch(arc_buf_t * buf)1594 arc_buf_watch(arc_buf_t *buf)
1595 {
1596 #ifndef _KERNEL
1597 if (arc_watch)
1598 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
1599 PROT_READ));
1600 #else
1601 (void) buf;
1602 #endif
1603 }
1604
1605 static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t * hdr)1606 arc_buf_type(arc_buf_hdr_t *hdr)
1607 {
1608 arc_buf_contents_t type;
1609 if (HDR_ISTYPE_METADATA(hdr)) {
1610 type = ARC_BUFC_METADATA;
1611 } else {
1612 type = ARC_BUFC_DATA;
1613 }
1614 VERIFY3U(hdr->b_type, ==, type);
1615 return (type);
1616 }
1617
1618 boolean_t
arc_is_metadata(arc_buf_t * buf)1619 arc_is_metadata(arc_buf_t *buf)
1620 {
1621 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1622 }
1623
1624 static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)1625 arc_bufc_to_flags(arc_buf_contents_t type)
1626 {
1627 switch (type) {
1628 case ARC_BUFC_DATA:
1629 /* metadata field is 0 if buffer contains normal data */
1630 return (0);
1631 case ARC_BUFC_METADATA:
1632 return (ARC_FLAG_BUFC_METADATA);
1633 default:
1634 break;
1635 }
1636 panic("undefined ARC buffer type!");
1637 return ((uint32_t)-1);
1638 }
1639
1640 void
arc_buf_thaw(arc_buf_t * buf)1641 arc_buf_thaw(arc_buf_t *buf)
1642 {
1643 arc_buf_hdr_t *hdr = buf->b_hdr;
1644
1645 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1646 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1647
1648 arc_cksum_verify(buf);
1649
1650 /*
1651 * Compressed buffers do not manipulate the b_freeze_cksum.
1652 */
1653 if (ARC_BUF_COMPRESSED(buf))
1654 return;
1655
1656 ASSERT(HDR_HAS_L1HDR(hdr));
1657 arc_cksum_free(hdr);
1658 arc_buf_unwatch(buf);
1659 }
1660
1661 void
arc_buf_freeze(arc_buf_t * buf)1662 arc_buf_freeze(arc_buf_t *buf)
1663 {
1664 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1665 return;
1666
1667 if (ARC_BUF_COMPRESSED(buf))
1668 return;
1669
1670 ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1671 arc_cksum_compute(buf);
1672 }
1673
1674 /*
1675 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1676 * the following functions should be used to ensure that the flags are
1677 * updated in a thread-safe way. When manipulating the flags either
1678 * the hash_lock must be held or the hdr must be undiscoverable. This
1679 * ensures that we're not racing with any other threads when updating
1680 * the flags.
1681 */
1682 static inline void
arc_hdr_set_flags(arc_buf_hdr_t * hdr,arc_flags_t flags)1683 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1684 {
1685 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1686 hdr->b_flags |= flags;
1687 }
1688
1689 static inline void
arc_hdr_clear_flags(arc_buf_hdr_t * hdr,arc_flags_t flags)1690 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1691 {
1692 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1693 hdr->b_flags &= ~flags;
1694 }
1695
1696 /*
1697 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1698 * done in a special way since we have to clear and set bits
1699 * at the same time. Consumers that wish to set the compression bits
1700 * must use this function to ensure that the flags are updated in
1701 * thread-safe manner.
1702 */
1703 static void
arc_hdr_set_compress(arc_buf_hdr_t * hdr,enum zio_compress cmp)1704 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1705 {
1706 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1707
1708 /*
1709 * Holes and embedded blocks will always have a psize = 0 so
1710 * we ignore the compression of the blkptr and set the
1711 * want to uncompress them. Mark them as uncompressed.
1712 */
1713 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1714 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1715 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
1716 } else {
1717 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1718 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1719 }
1720
1721 HDR_SET_COMPRESS(hdr, cmp);
1722 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
1723 }
1724
1725 /*
1726 * Looks for another buf on the same hdr which has the data decompressed, copies
1727 * from it, and returns true. If no such buf exists, returns false.
1728 */
1729 static boolean_t
arc_buf_try_copy_decompressed_data(arc_buf_t * buf)1730 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1731 {
1732 arc_buf_hdr_t *hdr = buf->b_hdr;
1733 boolean_t copied = B_FALSE;
1734
1735 ASSERT(HDR_HAS_L1HDR(hdr));
1736 ASSERT3P(buf->b_data, !=, NULL);
1737 ASSERT(!ARC_BUF_COMPRESSED(buf));
1738
1739 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1740 from = from->b_next) {
1741 /* can't use our own data buffer */
1742 if (from == buf) {
1743 continue;
1744 }
1745
1746 if (!ARC_BUF_COMPRESSED(from)) {
1747 memcpy(buf->b_data, from->b_data, arc_buf_size(buf));
1748 copied = B_TRUE;
1749 break;
1750 }
1751 }
1752
1753 #ifdef ZFS_DEBUG
1754 /*
1755 * There were no decompressed bufs, so there should not be a
1756 * checksum on the hdr either.
1757 */
1758 if (zfs_flags & ZFS_DEBUG_MODIFY)
1759 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1760 #endif
1761
1762 return (copied);
1763 }
1764
1765 /*
1766 * Allocates an ARC buf header that's in an evicted & L2-cached state.
1767 * This is used during l2arc reconstruction to make empty ARC buffers
1768 * which circumvent the regular disk->arc->l2arc path and instead come
1769 * into being in the reverse order, i.e. l2arc->arc.
1770 */
1771 static arc_buf_hdr_t *
arc_buf_alloc_l2only(size_t size,arc_buf_contents_t type,l2arc_dev_t * dev,dva_t dva,uint64_t daddr,int32_t psize,uint64_t asize,uint64_t birth,enum zio_compress compress,uint8_t complevel,boolean_t protected,boolean_t prefetch,arc_state_type_t arcs_state)1772 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
1773 dva_t dva, uint64_t daddr, int32_t psize, uint64_t asize, uint64_t birth,
1774 enum zio_compress compress, uint8_t complevel, boolean_t protected,
1775 boolean_t prefetch, arc_state_type_t arcs_state)
1776 {
1777 arc_buf_hdr_t *hdr;
1778
1779 ASSERT(size != 0);
1780 ASSERT(dev->l2ad_vdev != NULL);
1781
1782 hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
1783 hdr->b_birth = birth;
1784 hdr->b_type = type;
1785 hdr->b_flags = 0;
1786 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
1787 HDR_SET_LSIZE(hdr, size);
1788 HDR_SET_PSIZE(hdr, psize);
1789 HDR_SET_L2SIZE(hdr, asize);
1790 arc_hdr_set_compress(hdr, compress);
1791 hdr->b_complevel = complevel;
1792 if (protected)
1793 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
1794 if (prefetch)
1795 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
1796 hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
1797
1798 hdr->b_dva = dva;
1799
1800 hdr->b_l2hdr.b_dev = dev;
1801 hdr->b_l2hdr.b_daddr = daddr;
1802 hdr->b_l2hdr.b_arcs_state = arcs_state;
1803
1804 return (hdr);
1805 }
1806
1807 /*
1808 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1809 */
1810 static uint64_t
arc_hdr_size(arc_buf_hdr_t * hdr)1811 arc_hdr_size(arc_buf_hdr_t *hdr)
1812 {
1813 uint64_t size;
1814
1815 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1816 HDR_GET_PSIZE(hdr) > 0) {
1817 size = HDR_GET_PSIZE(hdr);
1818 } else {
1819 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1820 size = HDR_GET_LSIZE(hdr);
1821 }
1822 return (size);
1823 }
1824
1825 static int
arc_hdr_authenticate(arc_buf_hdr_t * hdr,spa_t * spa,uint64_t dsobj)1826 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1827 {
1828 int ret;
1829 uint64_t csize;
1830 uint64_t lsize = HDR_GET_LSIZE(hdr);
1831 uint64_t psize = HDR_GET_PSIZE(hdr);
1832 abd_t *abd = hdr->b_l1hdr.b_pabd;
1833 boolean_t free_abd = B_FALSE;
1834
1835 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1836 ASSERT(HDR_AUTHENTICATED(hdr));
1837 ASSERT3P(abd, !=, NULL);
1838
1839 /*
1840 * The MAC is calculated on the compressed data that is stored on disk.
1841 * However, if compressed arc is disabled we will only have the
1842 * decompressed data available to us now. Compress it into a temporary
1843 * abd so we can verify the MAC. The performance overhead of this will
1844 * be relatively low, since most objects in an encrypted objset will
1845 * be encrypted (instead of authenticated) anyway.
1846 */
1847 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1848 !HDR_COMPRESSION_ENABLED(hdr)) {
1849 abd = NULL;
1850 csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1851 hdr->b_l1hdr.b_pabd, &abd, lsize, MIN(lsize, psize),
1852 hdr->b_complevel);
1853 if (csize >= lsize || csize > psize) {
1854 ret = SET_ERROR(EIO);
1855 return (ret);
1856 }
1857 ASSERT3P(abd, !=, NULL);
1858 abd_zero_off(abd, csize, psize - csize);
1859 free_abd = B_TRUE;
1860 }
1861
1862 /*
1863 * Authentication is best effort. We authenticate whenever the key is
1864 * available. If we succeed we clear ARC_FLAG_NOAUTH.
1865 */
1866 if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1867 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1868 ASSERT3U(lsize, ==, psize);
1869 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1870 psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1871 } else {
1872 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1873 hdr->b_crypt_hdr.b_mac);
1874 }
1875
1876 if (ret == 0)
1877 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1878 else if (ret == EACCES)
1879 ret = 0;
1880
1881 if (free_abd)
1882 abd_free(abd);
1883
1884 return (ret);
1885 }
1886
1887 /*
1888 * This function will take a header that only has raw encrypted data in
1889 * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1890 * b_l1hdr.b_pabd. If designated in the header flags, this function will
1891 * also decompress the data.
1892 */
1893 static int
arc_hdr_decrypt(arc_buf_hdr_t * hdr,spa_t * spa,const zbookmark_phys_t * zb)1894 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
1895 {
1896 int ret;
1897 abd_t *cabd = NULL;
1898 boolean_t no_crypt = B_FALSE;
1899 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1900
1901 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1902 ASSERT(HDR_ENCRYPTED(hdr));
1903
1904 arc_hdr_alloc_abd(hdr, 0);
1905
1906 ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1907 B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1908 hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
1909 hdr->b_crypt_hdr.b_rabd, &no_crypt);
1910 if (ret != 0)
1911 goto error;
1912
1913 if (no_crypt) {
1914 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1915 HDR_GET_PSIZE(hdr));
1916 }
1917
1918 /*
1919 * If this header has disabled arc compression but the b_pabd is
1920 * compressed after decrypting it, we need to decompress the newly
1921 * decrypted data.
1922 */
1923 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1924 !HDR_COMPRESSION_ENABLED(hdr)) {
1925 /*
1926 * We want to make sure that we are correctly honoring the
1927 * zfs_abd_scatter_enabled setting, so we allocate an abd here
1928 * and then loan a buffer from it, rather than allocating a
1929 * linear buffer and wrapping it in an abd later.
1930 */
1931 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
1932
1933 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1934 hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
1935 HDR_GET_LSIZE(hdr), &hdr->b_complevel);
1936 if (ret != 0) {
1937 goto error;
1938 }
1939
1940 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1941 arc_hdr_size(hdr), hdr);
1942 hdr->b_l1hdr.b_pabd = cabd;
1943 }
1944
1945 return (0);
1946
1947 error:
1948 arc_hdr_free_abd(hdr, B_FALSE);
1949 if (cabd != NULL)
1950 arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
1951
1952 return (ret);
1953 }
1954
1955 /*
1956 * This function is called during arc_buf_fill() to prepare the header's
1957 * abd plaintext pointer for use. This involves authenticated protected
1958 * data and decrypting encrypted data into the plaintext abd.
1959 */
1960 static int
arc_fill_hdr_crypt(arc_buf_hdr_t * hdr,kmutex_t * hash_lock,spa_t * spa,const zbookmark_phys_t * zb,boolean_t noauth)1961 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
1962 const zbookmark_phys_t *zb, boolean_t noauth)
1963 {
1964 int ret;
1965
1966 ASSERT(HDR_PROTECTED(hdr));
1967
1968 if (hash_lock != NULL)
1969 mutex_enter(hash_lock);
1970
1971 if (HDR_NOAUTH(hdr) && !noauth) {
1972 /*
1973 * The caller requested authenticated data but our data has
1974 * not been authenticated yet. Verify the MAC now if we can.
1975 */
1976 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
1977 if (ret != 0)
1978 goto error;
1979 } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
1980 /*
1981 * If we only have the encrypted version of the data, but the
1982 * unencrypted version was requested we take this opportunity
1983 * to store the decrypted version in the header for future use.
1984 */
1985 ret = arc_hdr_decrypt(hdr, spa, zb);
1986 if (ret != 0)
1987 goto error;
1988 }
1989
1990 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1991
1992 if (hash_lock != NULL)
1993 mutex_exit(hash_lock);
1994
1995 return (0);
1996
1997 error:
1998 if (hash_lock != NULL)
1999 mutex_exit(hash_lock);
2000
2001 return (ret);
2002 }
2003
2004 /*
2005 * This function is used by the dbuf code to decrypt bonus buffers in place.
2006 * The dbuf code itself doesn't have any locking for decrypting a shared dnode
2007 * block, so we use the hash lock here to protect against concurrent calls to
2008 * arc_buf_fill().
2009 */
2010 static void
arc_buf_untransform_in_place(arc_buf_t * buf)2011 arc_buf_untransform_in_place(arc_buf_t *buf)
2012 {
2013 arc_buf_hdr_t *hdr = buf->b_hdr;
2014
2015 ASSERT(HDR_ENCRYPTED(hdr));
2016 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2017 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2018 ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);
2019
2020 zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
2021 arc_buf_size(buf));
2022 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
2023 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2024 }
2025
/*
 * Given a buf that has a data buffer attached to it, this function will
 * efficiently fill the buf with data of the specified compression setting from
 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
 * are already sharing a data buf, no copy is performed.
 *
 * If the buf is marked as compressed but uncompressed data was requested, this
 * will allocate a new data buffer for the buf, remove that flag, and fill the
 * buf with uncompressed data. You can't request a compressed buf on a hdr with
 * uncompressed data, and (since we haven't added support for it yet) if you
 * want compressed data your buf must already be marked as compressed and have
 * the correct-sized data buffer.
 *
 * The 'flags' argument selects the behavior:
 *   ARC_FILL_COMPRESSED - the caller wants compressed data.
 *   ARC_FILL_ENCRYPTED  - the caller wants the raw data copied from b_rabd.
 *   ARC_FILL_LOCKED     - this function does not take the hash lock itself
 *                         (presumably the caller already holds it).
 *   ARC_FILL_NOAUTH     - passed on to arc_fill_hdr_crypt() as 'noauth'.
 *   ARC_FILL_IN_PLACE   - untransform the buf's existing data in place.
 *
 * Returns 0 on success. An error from arc_fill_hdr_crypt() is returned
 * unchanged, and EIO is returned if decompression fails; in both cases the
 * hdr is flagged ARC_FLAG_IO_ERROR, except for EACCES on an in-place
 * request.
 */
static int
arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
    arc_fill_flags_t flags)
{
	int error = 0;
	arc_buf_hdr_t *hdr = buf->b_hdr;
	boolean_t hdr_compressed =
	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
	/* A NULL hash_lock means no locking is done by this function. */
	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);

	ASSERT3P(buf->b_data, !=, NULL);
	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
	IMPLY(encrypted, !arc_buf_is_shared(buf));

	/*
	 * If the caller wanted encrypted data we just need to copy it from
	 * b_rabd and potentially byteswap it. We won't be able to do any
	 * further transforms on it.
	 */
	if (encrypted) {
		ASSERT(HDR_HAS_RABD(hdr));
		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
		    HDR_GET_PSIZE(hdr));
		goto byteswap;
	}

	/*
	 * Adjust encrypted and authenticated headers to accommodate
	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
	 * allowed to fail decryption due to keys not being loaded
	 * without being marked as an IO error.
	 */
	if (HDR_PROTECTED(hdr)) {
		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
		    zb, !!(flags & ARC_FILL_NOAUTH));
		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
			/* Reported to the caller, but no IO error is set. */
			return (error);
		} else if (error != 0) {
			if (hash_lock != NULL)
				mutex_enter(hash_lock);
			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
			if (hash_lock != NULL)
				mutex_exit(hash_lock);
			return (error);
		}
	}

	/*
	 * There is a special case here for dnode blocks which are
	 * decrypting their bonus buffers. These blocks may request to
	 * be decrypted in-place. This is necessary because there may
	 * be many dnodes pointing into this buffer and there is
	 * currently no method to synchronize replacing the backing
	 * b_data buffer and updating all of the pointers. Here we use
	 * the hash lock to ensure there are no races. If the need
	 * arises for other types to be decrypted in-place, they must
	 * add handling here as well.
	 */
	if ((flags & ARC_FILL_IN_PLACE) != 0) {
		ASSERT(!hdr_compressed);
		ASSERT(!compressed);
		ASSERT(!encrypted);

		/* Nothing to do unless the buf still holds encrypted data. */
		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);

			if (hash_lock != NULL)
				mutex_enter(hash_lock);
			arc_buf_untransform_in_place(buf);
			if (hash_lock != NULL)
				mutex_exit(hash_lock);

			/* Compute the hdr's checksum if necessary */
			arc_cksum_compute(buf);
		}

		return (0);
	}

	if (hdr_compressed == compressed) {
		/*
		 * The hdr's data is already in the requested form: a shared
		 * buf needs no copy, any other buf gets a straight copy.
		 */
		if (ARC_BUF_SHARED(buf)) {
			ASSERT(arc_buf_is_shared(buf));
		} else {
			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
			    arc_buf_size(buf));
		}
	} else {
		/* The only supported mismatch: compressed hdr, plain buf. */
		ASSERT(hdr_compressed);
		ASSERT(!compressed);

		/*
		 * If the buf is sharing its data with the hdr, unlink it and
		 * allocate a new data buffer for the buf.
		 */
		if (ARC_BUF_SHARED(buf)) {
			ASSERTF(ARC_BUF_COMPRESSED(buf),
			    "buf %p was uncompressed", buf);

			/* We need to give the buf its own b_data */
			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);

			/* Previously overhead was 0; just add new overhead */
			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
		} else if (ARC_BUF_COMPRESSED(buf)) {
			ASSERT(!arc_buf_is_shared(buf));

			/* We need to reallocate the buf's b_data */
			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
			    buf);
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);

			/* We increased the size of b_data; update overhead */
			ARCSTAT_INCR(arcstat_overhead_size,
			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
		}

		/*
		 * Regardless of the buf's previous compression settings, it
		 * should not be compressed at the end of this function.
		 */
		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;

		/*
		 * Try copying the data from another buf which already has a
		 * decompressed version. If that's not possible, it's time to
		 * bite the bullet and decompress the data from the hdr.
		 */
		if (arc_buf_try_copy_decompressed_data(buf)) {
			/* Skip byteswapping and checksumming (already done) */
			return (0);
		} else {
			/* Wrap b_data in an on-stack abd as the destination. */
			abd_t dabd;
			abd_get_from_buf_struct(&dabd, buf->b_data,
			    HDR_GET_LSIZE(hdr));
			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
			    hdr->b_l1hdr.b_pabd, &dabd,
			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
			    &hdr->b_complevel);
			abd_free(&dabd);

			/*
			 * Absent hardware errors or software bugs, this should
			 * be impossible, but log it anyway so we can debug it.
			 */
			if (error != 0) {
				zfs_dbgmsg(
				    "hdr %px, compress %d, psize %d, lsize %d",
				    hdr, arc_hdr_get_compress(hdr),
				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
				if (hash_lock != NULL)
					mutex_enter(hash_lock);
				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
				if (hash_lock != NULL)
					mutex_exit(hash_lock);
				/* Any decompression error is reported as EIO. */
				return (SET_ERROR(EIO));
			}
		}
	}

byteswap:
	/* Byteswap the buf's data if necessary */
	if (bswap != DMU_BSWAP_NUMFUNCS) {
		ASSERT(!HDR_SHARED_DATA(hdr));
		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
	}

	/* Compute the hdr's checksum if necessary */
	arc_cksum_compute(buf);

	return (0);
}
2222
2223 /*
2224 * If this function is being called to decrypt an encrypted buffer or verify an
2225 * authenticated one, the key must be loaded and a mapping must be made
2226 * available in the keystore via spa_keystore_create_mapping() or one of its
2227 * callers.
2228 */
2229 int
arc_untransform(arc_buf_t * buf,spa_t * spa,const zbookmark_phys_t * zb,boolean_t in_place)2230 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2231 boolean_t in_place)
2232 {
2233 int ret;
2234 arc_fill_flags_t flags = 0;
2235
2236 if (in_place)
2237 flags |= ARC_FILL_IN_PLACE;
2238
2239 ret = arc_buf_fill(buf, spa, zb, flags);
2240 if (ret == ECKSUM) {
2241 /*
2242 * Convert authentication and decryption errors to EIO
2243 * (and generate an ereport) before leaving the ARC.
2244 */
2245 ret = SET_ERROR(EIO);
2246 spa_log_error(spa, zb, buf->b_hdr->b_birth);
2247 (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2248 spa, NULL, zb, NULL, 0);
2249 }
2250
2251 return (ret);
2252 }
2253
2254 /*
2255 * Increment the amount of evictable space in the arc_state_t's refcount.
2256 * We account for the space used by the hdr and the arc buf individually
2257 * so that we can add and remove them from the refcount individually.
2258 */
2259 static void
arc_evictable_space_increment(arc_buf_hdr_t * hdr,arc_state_t * state)2260 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2261 {
2262 arc_buf_contents_t type = arc_buf_type(hdr);
2263
2264 ASSERT(HDR_HAS_L1HDR(hdr));
2265
2266 if (GHOST_STATE(state)) {
2267 ASSERT0P(hdr->b_l1hdr.b_buf);
2268 ASSERT0P(hdr->b_l1hdr.b_pabd);
2269 ASSERT(!HDR_HAS_RABD(hdr));
2270 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2271 HDR_GET_LSIZE(hdr), hdr);
2272 return;
2273 }
2274
2275 if (hdr->b_l1hdr.b_pabd != NULL) {
2276 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2277 arc_hdr_size(hdr), hdr);
2278 }
2279 if (HDR_HAS_RABD(hdr)) {
2280 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2281 HDR_GET_PSIZE(hdr), hdr);
2282 }
2283
2284 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2285 buf = buf->b_next) {
2286 if (ARC_BUF_SHARED(buf))
2287 continue;
2288 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2289 arc_buf_size(buf), buf);
2290 }
2291 }
2292
2293 /*
2294 * Decrement the amount of evictable space in the arc_state_t's refcount.
2295 * We account for the space used by the hdr and the arc buf individually
2296 * so that we can add and remove them from the refcount individually.
2297 */
2298 static void
arc_evictable_space_decrement(arc_buf_hdr_t * hdr,arc_state_t * state)2299 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2300 {
2301 arc_buf_contents_t type = arc_buf_type(hdr);
2302
2303 ASSERT(HDR_HAS_L1HDR(hdr));
2304
2305 if (GHOST_STATE(state)) {
2306 ASSERT0P(hdr->b_l1hdr.b_buf);
2307 ASSERT0P(hdr->b_l1hdr.b_pabd);
2308 ASSERT(!HDR_HAS_RABD(hdr));
2309 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2310 HDR_GET_LSIZE(hdr), hdr);
2311 return;
2312 }
2313
2314 if (hdr->b_l1hdr.b_pabd != NULL) {
2315 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2316 arc_hdr_size(hdr), hdr);
2317 }
2318 if (HDR_HAS_RABD(hdr)) {
2319 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2320 HDR_GET_PSIZE(hdr), hdr);
2321 }
2322
2323 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2324 buf = buf->b_next) {
2325 if (ARC_BUF_SHARED(buf))
2326 continue;
2327 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2328 arc_buf_size(buf), buf);
2329 }
2330 }
2331
2332 /*
2333 * Add a reference to this hdr indicating that someone is actively
2334 * referencing that memory. When the refcount transitions from 0 to 1,
2335 * we remove it from the respective arc_state_t list to indicate that
2336 * it is not evictable.
2337 */
2338 static void
add_reference(arc_buf_hdr_t * hdr,const void * tag)2339 add_reference(arc_buf_hdr_t *hdr, const void *tag)
2340 {
2341 arc_state_t *state = hdr->b_l1hdr.b_state;
2342
2343 ASSERT(HDR_HAS_L1HDR(hdr));
2344 if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
2345 ASSERT(state == arc_anon);
2346 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2347 ASSERT0P(hdr->b_l1hdr.b_buf);
2348 }
2349
2350 if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2351 state != arc_anon && state != arc_l2c_only) {
2352 /* We don't use the L2-only state list. */
2353 multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
2354 arc_evictable_space_decrement(hdr, state);
2355 }
2356 }
2357
2358 /*
2359 * Remove a reference from this hdr. When the reference transitions from
2360 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2361 * list making it eligible for eviction.
2362 */
2363 static int
remove_reference(arc_buf_hdr_t * hdr,const void * tag)2364 remove_reference(arc_buf_hdr_t *hdr, const void *tag)
2365 {
2366 int cnt;
2367 arc_state_t *state = hdr->b_l1hdr.b_state;
2368
2369 ASSERT(HDR_HAS_L1HDR(hdr));
2370 ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
2371 ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */
2372
2373 if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
2374 return (cnt);
2375
2376 if (state == arc_anon) {
2377 arc_hdr_destroy(hdr);
2378 return (0);
2379 }
2380 if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
2381 arc_change_state(arc_anon, hdr);
2382 arc_hdr_destroy(hdr);
2383 return (0);
2384 }
2385 multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
2386 arc_evictable_space_increment(hdr, state);
2387 return (0);
2388 }
2389
2390 /*
2391 * Returns detailed information about a specific arc buffer. When the
2392 * state_index argument is set the function will calculate the arc header
2393 * list position for its arc state. Since this requires a linear traversal
2394 * callers are strongly encourage not to do this. However, it can be helpful
2395 * for targeted analysis so the functionality is provided.
2396 */
2397 void
arc_buf_info(arc_buf_t * ab,arc_buf_info_t * abi,int state_index)2398 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
2399 {
2400 (void) state_index;
2401 arc_buf_hdr_t *hdr = ab->b_hdr;
2402 l1arc_buf_hdr_t *l1hdr = NULL;
2403 l2arc_buf_hdr_t *l2hdr = NULL;
2404 arc_state_t *state = NULL;
2405
2406 memset(abi, 0, sizeof (arc_buf_info_t));
2407
2408 if (hdr == NULL)
2409 return;
2410
2411 abi->abi_flags = hdr->b_flags;
2412
2413 if (HDR_HAS_L1HDR(hdr)) {
2414 l1hdr = &hdr->b_l1hdr;
2415 state = l1hdr->b_state;
2416 }
2417 if (HDR_HAS_L2HDR(hdr))
2418 l2hdr = &hdr->b_l2hdr;
2419
2420 if (l1hdr) {
2421 abi->abi_bufcnt = 0;
2422 for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
2423 abi->abi_bufcnt++;
2424 abi->abi_access = l1hdr->b_arc_access;
2425 abi->abi_mru_hits = l1hdr->b_mru_hits;
2426 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
2427 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
2428 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
2429 abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
2430 }
2431
2432 if (l2hdr) {
2433 abi->abi_l2arc_dattr = l2hdr->b_daddr;
2434 abi->abi_l2arc_hits = l2hdr->b_hits;
2435 }
2436
2437 abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
2438 abi->abi_state_contents = arc_buf_type(hdr);
2439 abi->abi_size = arc_hdr_size(hdr);
2440 }
2441
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 *
 * Besides updating b_state, this transfers an unreferenced hdr between
 * the old and new state lists (with the matching evictable-size
 * accounting), removes the hdr from the hash table when it becomes
 * anonymous, moves the arcs_size accounting from the old state to the new
 * one, and updates the L2ARC per-state statistics.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
{
	arc_state_t *old_state;
	int64_t refcnt;
	/* Whether arcs_size of the old/new state has to be adjusted. */
	boolean_t update_old, update_new;
	arc_buf_contents_t type = arc_buf_type(hdr);

	/*
	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
	 * in arc_read() when bringing a buffer out of the L2ARC. However, the
	 * L1 hdr doesn't always exist when we change state to arc_anon before
	 * destroying a header, in which case reallocating to add the L1 hdr is
	 * pointless.
	 */
	if (HDR_HAS_L1HDR(hdr)) {
		old_state = hdr->b_l1hdr.b_state;
		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
		/* Sizes only need moving if the hdr actually holds data. */
		update_old = (hdr->b_l1hdr.b_buf != NULL ||
		    hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));

		IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
		IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
		IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
		    ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
	} else {
		/* No L1 hdr: treat the hdr as L2-only and unreferenced. */
		old_state = arc_l2c_only;
		refcnt = 0;
		update_old = B_FALSE;
	}
	/* Ghost states are always accounted for (by logical size). */
	update_new = update_old;
	if (GHOST_STATE(old_state))
		update_old = B_TRUE;
	if (GHOST_STATE(new_state))
		update_new = B_TRUE;

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT3P(new_state, !=, old_state);

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon && old_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			/*
			 * remove_reference() saves on insert: for an
			 * arc_uncached hdr it changes state without first
			 * inserting the hdr on the list, so the hdr may not
			 * be linked here.
			 */
			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
				multilist_remove(&old_state->arcs_list[type],
				    hdr);
				arc_evictable_space_decrement(hdr, old_state);
			}
		}
		if (new_state != arc_anon && new_state != arc_l2c_only) {
			/*
			 * An L1 header always exists here, since if we're
			 * moving to some L1-cached state (i.e. not l2c_only or
			 * anonymous), we realloc the header to add an L1hdr
			 * beforehand.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_insert(&new_state->arcs_list[type], hdr);
			arc_evictable_space_increment(hdr, new_state);
		}
	}

	ASSERT(!HDR_EMPTY(hdr));
	/* Anonymous hdrs are not kept in the hash table. */
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
		buf_hash_remove(hdr);

	/* adjust state sizes (ignore arc_l2c_only) */

	if (update_new && new_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(new_state)) {

			/*
			 * When moving a header to a ghost state, we first
			 * remove all arc buffers. Thus, we'll have no arc
			 * buffer to use for the reference. As a result, we
			 * use the arc header pointer for the reference.
			 */
			(void) zfs_refcount_add_many(
			    &new_state->arcs_size[type],
			    HDR_GET_LSIZE(hdr), hdr);
			ASSERT0P(hdr->b_l1hdr.b_pabd);
			ASSERT(!HDR_HAS_RABD(hdr));
		} else {

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must add each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (ARC_BUF_SHARED(buf))
					continue;

				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    arc_buf_size(buf), buf);
			}

			/* The hdr's own data is tagged with the hdr. */
			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    arc_hdr_size(hdr), hdr);
			}

			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    HDR_GET_PSIZE(hdr), hdr);
			}
		}
	}

	if (update_old && old_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(old_state)) {
			ASSERT0P(hdr->b_l1hdr.b_pabd);
			ASSERT(!HDR_HAS_RABD(hdr));

			/*
			 * When moving a header off of a ghost state,
			 * the header will not contain any arc buffers.
			 * We use the arc header pointer for the reference
			 * which is exactly what we did when we put the
			 * header on the ghost state.
			 */

			(void) zfs_refcount_remove_many(
			    &old_state->arcs_size[type],
			    HDR_GET_LSIZE(hdr), hdr);
		} else {

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * remove from the refcount if the arc_buf_t
				 * is not shared.
				 */
				if (ARC_BUF_SHARED(buf))
					continue;

				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    arc_buf_size(buf), buf);
			}
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
			    HDR_HAS_RABD(hdr));

			/* The hdr's own data was tagged with the hdr. */
			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    arc_hdr_size(hdr), hdr);
			}

			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    HDR_GET_PSIZE(hdr), hdr);
			}
		}
	}

	if (HDR_HAS_L1HDR(hdr)) {
		hdr->b_l1hdr.b_state = new_state;

		/* Keep the L2ARC per-state statistics in step. */
		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
			l2arc_hdr_arcstats_decrement_state(hdr);
			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
			l2arc_hdr_arcstats_increment_state(hdr);
		}
	}
}
2640
2641 void
arc_space_consume(uint64_t space,arc_space_type_t type)2642 arc_space_consume(uint64_t space, arc_space_type_t type)
2643 {
2644 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2645
2646 switch (type) {
2647 default:
2648 break;
2649 case ARC_SPACE_DATA:
2650 ARCSTAT_INCR(arcstat_data_size, space);
2651 break;
2652 case ARC_SPACE_META:
2653 ARCSTAT_INCR(arcstat_metadata_size, space);
2654 break;
2655 case ARC_SPACE_BONUS:
2656 ARCSTAT_INCR(arcstat_bonus_size, space);
2657 break;
2658 case ARC_SPACE_DNODE:
2659 aggsum_add(&arc_sums.arcstat_dnode_size, space);
2660 break;
2661 case ARC_SPACE_DBUF:
2662 ARCSTAT_INCR(arcstat_dbuf_size, space);
2663 break;
2664 case ARC_SPACE_HDRS:
2665 ARCSTAT_INCR(arcstat_hdr_size, space);
2666 break;
2667 case ARC_SPACE_L2HDRS:
2668 aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
2669 break;
2670 case ARC_SPACE_ABD_CHUNK_WASTE:
2671 /*
2672 * Note: this includes space wasted by all scatter ABD's, not
2673 * just those allocated by the ARC. But the vast majority of
2674 * scatter ABD's come from the ARC, because other users are
2675 * very short-lived.
2676 */
2677 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
2678 break;
2679 }
2680
2681 if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
2682 ARCSTAT_INCR(arcstat_meta_used, space);
2683
2684 aggsum_add(&arc_sums.arcstat_size, space);
2685 }
2686
2687 void
arc_space_return(uint64_t space,arc_space_type_t type)2688 arc_space_return(uint64_t space, arc_space_type_t type)
2689 {
2690 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2691
2692 switch (type) {
2693 default:
2694 break;
2695 case ARC_SPACE_DATA:
2696 ARCSTAT_INCR(arcstat_data_size, -space);
2697 break;
2698 case ARC_SPACE_META:
2699 ARCSTAT_INCR(arcstat_metadata_size, -space);
2700 break;
2701 case ARC_SPACE_BONUS:
2702 ARCSTAT_INCR(arcstat_bonus_size, -space);
2703 break;
2704 case ARC_SPACE_DNODE:
2705 aggsum_add(&arc_sums.arcstat_dnode_size, -space);
2706 break;
2707 case ARC_SPACE_DBUF:
2708 ARCSTAT_INCR(arcstat_dbuf_size, -space);
2709 break;
2710 case ARC_SPACE_HDRS:
2711 ARCSTAT_INCR(arcstat_hdr_size, -space);
2712 break;
2713 case ARC_SPACE_L2HDRS:
2714 aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
2715 break;
2716 case ARC_SPACE_ABD_CHUNK_WASTE:
2717 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
2718 break;
2719 }
2720
2721 if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
2722 ARCSTAT_INCR(arcstat_meta_used, -space);
2723
2724 ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
2725 aggsum_add(&arc_sums.arcstat_size, -space);
2726 }
2727
2728 /*
2729 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2730 * with the hdr's b_pabd.
2731 */
2732 static boolean_t
arc_can_share(arc_buf_hdr_t * hdr,arc_buf_t * buf)2733 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2734 {
2735 /*
2736 * The criteria for sharing a hdr's data are:
2737 * 1. the buffer is not encrypted
2738 * 2. the hdr's compression matches the buf's compression
2739 * 3. the hdr doesn't need to be byteswapped
2740 * 4. the hdr isn't already being shared
2741 * 5. the buf is either compressed or it is the last buf in the hdr list
2742 *
2743 * Criterion #5 maintains the invariant that shared uncompressed
2744 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2745 * might ask, "if a compressed buf is allocated first, won't that be the
2746 * last thing in the list?", but in that case it's impossible to create
2747 * a shared uncompressed buf anyway (because the hdr must be compressed
2748 * to have the compressed buf). You might also think that #3 is
2749 * sufficient to make this guarantee, however it's possible
2750 * (specifically in the rare L2ARC write race mentioned in
2751 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2752 * is shareable, but wasn't at the time of its allocation. Rather than
2753 * allow a new shared uncompressed buf to be created and then shuffle
2754 * the list around to make it the last element, this simply disallows
2755 * sharing if the new buf isn't the first to be added.
2756 */
2757 ASSERT3P(buf->b_hdr, ==, hdr);
2758 boolean_t hdr_compressed =
2759 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
2760 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2761 return (!ARC_BUF_ENCRYPTED(buf) &&
2762 buf_compressed == hdr_compressed &&
2763 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2764 !HDR_SHARED_DATA(hdr) &&
2765 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2766 }
2767
/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
 * copy was made successfully, or an error code otherwise.
 *
 *	hdr		header the new buf is attached to; must have an L1 hdr
 *	spa, zb		passed through to arc_buf_fill(); zb must be non-NULL
 *			when fill is set
 *	tag		holder passed to add_reference() for the new reference
 *	encrypted	request an encrypted buf; implies compressed
 *	compressed	request a compressed buf; honored only when the hdr is
 *			actually compressed (or when encrypted is set)
 *	noauth		adds ARC_FILL_NOAUTH to the fill flags; must not be
 *			combined with encrypted
 *	fill		if set, the buf is populated through arc_buf_fill()
 *	ret		out parameter; must point at NULL on entry
 *
 * *ret is assigned as soon as the buf is allocated, so it refers to the new
 * buf (already linked onto the hdr's list) even when arc_buf_fill() fails.
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
    const void *tag, boolean_t encrypted, boolean_t compressed,
    boolean_t noauth, boolean_t fill, arc_buf_t **ret)
{
	arc_buf_t *buf;
	arc_fill_flags_t flags = ARC_FILL_LOCKED;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
	    hdr->b_type == ARC_BUFC_METADATA);
	ASSERT3P(ret, !=, NULL);
	ASSERT0P(*ret);
	IMPLY(encrypted, compressed);

	/*
	 * b_next is pointed at the current list head here, but the hdr's
	 * list head itself is only updated further below, after b_data has
	 * been set up.
	 */
	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	buf->b_flags = 0;

	add_reference(hdr, tag);

	/*
	 * We're about to change the hdr's b_flags. We must either
	 * hold the hash_lock or be undiscoverable.
	 */
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * Only honor requests for compressed bufs if the hdr is actually
	 * compressed. This must be overridden if the buffer is encrypted since
	 * encrypted buffers cannot be decompressed.
	 */
	if (encrypted) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
	} else if (compressed &&
	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		flags |= ARC_FILL_COMPRESSED;
	}

	if (noauth) {
		ASSERT0(encrypted);
		flags |= ARC_FILL_NOAUTH;
	}

	/*
	 * If the hdr's data can be shared then we share the data buffer and
	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
	 * buffer to store the buf's data.
	 *
	 * There are two additional restrictions here because we're sharing
	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
	 * actively involved in an L2ARC write, because if this buf is used by
	 * an arc_write() then the hdr's data buffer will be released when the
	 * write completes, even though the L2ARC write might still be using it.
	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
	 * need to be ABD-aware.  It must be allocated via
	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
	 * page" buffers because the ABD code needs to handle freeing them
	 * specially.
	 */
	boolean_t can_share = arc_can_share(hdr, buf) &&
	    !HDR_L2_WRITING(hdr) &&
	    hdr->b_l1hdr.b_pabd != NULL &&
	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);

	/* Set up b_data and sharing */
	if (can_share) {
		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	} else {
		/* A private copy is counted in arcstat_overhead_size. */
		buf->b_data =
		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}
	VERIFY3P(buf->b_data, !=, NULL);

	/* Publish the buf as the new head of the hdr's buf list. */
	hdr->b_l1hdr.b_buf = buf;

	/*
	 * If the user wants the data from the hdr, we need to either copy or
	 * decompress the data.
	 */
	if (fill) {
		ASSERT3P(zb, !=, NULL);
		return (arc_buf_fill(buf, spa, zb, flags));
	}

	return (0);
}
2872
/*
 * Reference tag used by the arc_loan_*() routines for the hold on a hdr
 * while its buf is out on loan; arc_return_buf() swaps it for the caller's
 * tag.
 */
static const char *arc_onloan_tag = "onloan";
2874
/*
 * Atomically adjust the arc_loaned_bytes counter by delta. Callers pass a
 * positive delta when loaning a buf and a negative one when it is returned.
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
	atomic_add_64(&arc_loaned_bytes, delta);

	/*
	 * assert that it did not wrap around; adding 0 with the _nv variant
	 * is used only to read back the current value
	 */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
2883
2884 /*
2885 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2886 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2887 * buffers must be returned to the arc before they can be used by the DMU or
2888 * freed.
2889 */
2890 arc_buf_t *
arc_loan_buf(spa_t * spa,boolean_t is_metadata,int size)2891 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2892 {
2893 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2894 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2895
2896 arc_loaned_bytes_update(arc_buf_size(buf));
2897
2898 return (buf);
2899 }
2900
2901 arc_buf_t *
arc_loan_compressed_buf(spa_t * spa,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)2902 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2903 enum zio_compress compression_type, uint8_t complevel)
2904 {
2905 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2906 psize, lsize, compression_type, complevel);
2907
2908 arc_loaned_bytes_update(arc_buf_size(buf));
2909
2910 return (buf);
2911 }
2912
2913 arc_buf_t *
arc_loan_raw_buf(spa_t * spa,uint64_t dsobj,boolean_t byteorder,const uint8_t * salt,const uint8_t * iv,const uint8_t * mac,dmu_object_type_t ot,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)2914 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2915 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2916 dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2917 enum zio_compress compression_type, uint8_t complevel)
2918 {
2919 arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2920 byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
2921 complevel);
2922
2923 atomic_add_64(&arc_loaned_bytes, psize);
2924 return (buf);
2925 }
2926
2927
2928 /*
2929 * Return a loaned arc buffer to the arc.
2930 */
2931 void
arc_return_buf(arc_buf_t * buf,const void * tag)2932 arc_return_buf(arc_buf_t *buf, const void *tag)
2933 {
2934 arc_buf_hdr_t *hdr = buf->b_hdr;
2935
2936 ASSERT3P(buf->b_data, !=, NULL);
2937 ASSERT(HDR_HAS_L1HDR(hdr));
2938 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2939 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2940
2941 arc_loaned_bytes_update(-arc_buf_size(buf));
2942 }
2943
2944 /* Detach an arc_buf from a dbuf (tag) */
2945 void
arc_loan_inuse_buf(arc_buf_t * buf,const void * tag)2946 arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
2947 {
2948 arc_buf_hdr_t *hdr = buf->b_hdr;
2949
2950 ASSERT3P(buf->b_data, !=, NULL);
2951 ASSERT(HDR_HAS_L1HDR(hdr));
2952 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2953 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2954
2955 arc_loaned_bytes_update(arc_buf_size(buf));
2956 }
2957
2958 static void
l2arc_free_abd_on_write(abd_t * abd,l2arc_dev_t * dev)2959 l2arc_free_abd_on_write(abd_t *abd, l2arc_dev_t *dev)
2960 {
2961 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2962
2963 df->l2df_abd = abd;
2964 df->l2df_dev = dev;
2965 mutex_enter(&l2arc_free_on_write_mtx);
2966 list_insert_head(l2arc_free_on_write, df);
2967 mutex_exit(&l2arc_free_on_write_mtx);
2968 }
2969
2970 static void
arc_hdr_free_on_write(arc_buf_hdr_t * hdr,boolean_t free_rdata)2971 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
2972 {
2973 arc_state_t *state = hdr->b_l1hdr.b_state;
2974 arc_buf_contents_t type = arc_buf_type(hdr);
2975 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
2976
2977 /* protected by hash lock, if in the hash table */
2978 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2979 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2980 ASSERT(state != arc_anon && state != arc_l2c_only);
2981
2982 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2983 size, hdr);
2984 }
2985 (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
2986 if (type == ARC_BUFC_METADATA) {
2987 arc_space_return(size, ARC_SPACE_META);
2988 } else {
2989 ASSERT(type == ARC_BUFC_DATA);
2990 arc_space_return(size, ARC_SPACE_DATA);
2991 }
2992
2993 /*
2994 * L2HDR must exist since we're freeing an L2ARC-related ABD.
2995 */
2996 ASSERT(HDR_HAS_L2HDR(hdr));
2997
2998 if (free_rdata) {
2999 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd,
3000 hdr->b_l2hdr.b_dev);
3001 } else {
3002 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd,
3003 hdr->b_l2hdr.b_dev);
3004 }
3005 }
3006
/*
 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
 * data buffer, we transfer the refcount ownership to the hdr and update
 * the appropriate kstats.
 *
 * On entry the hdr must not have a b_pabd; on return b_pabd is an ABD
 * wrapping buf->b_data, and both the hdr (ARC_FLAG_SHARED_DATA) and the
 * buf (ARC_BUF_FLAG_SHARED) are marked as shared.
 */
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(arc_can_share(hdr, buf));
	ASSERT0P(hdr->b_l1hdr.b_pabd);
	ASSERT(!ARC_BUF_ENCRYPTED(buf));
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * Start sharing the data buffer. We transfer the
	 * refcount ownership to the hdr since it always owns
	 * the refcount whenever an arc_buf_t is shared.
	 *
	 * NOTE(review): the transfer uses arc_hdr_size(hdr) while the ABD
	 * below wraps arc_buf_size(buf) bytes; presumably the compression
	 * match required by arc_can_share() keeps the two equal -- confirm.
	 */
	zfs_refcount_transfer_ownership_many(
	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
	    arc_hdr_size(hdr), buf, hdr);
	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
	    HDR_ISTYPE_METADATA(hdr));
	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	buf->b_flags |= ARC_BUF_FLAG_SHARED;

	/*
	 * Since we've transferred ownership to the hdr we need
	 * to increment its compressed and uncompressed kstats and
	 * decrement the overhead size.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
3043
/*
 * Stop sharing the buf's data with the hdr; the reverse of arc_share_buf().
 * Refcount ownership goes back from the hdr to the buf, the hdr's ABD
 * wrapper is released and freed, b_pabd is reset to NULL and the shared
 * flags on both the hdr and the buf are cleared. buf->b_data itself is
 * left untouched.
 */
static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(arc_buf_is_shared(buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * We are no longer sharing this buffer so we need
	 * to transfer its ownership to the rightful owner.
	 */
	zfs_refcount_transfer_ownership_many(
	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
	    arc_hdr_size(hdr), hdr, buf);
	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
	/* Ownership is released before abd_free() is called on the ABD. */
	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
	abd_free(hdr->b_l1hdr.b_pabd);
	hdr->b_l1hdr.b_pabd = NULL;
	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

	/*
	 * Since the buffer is no longer shared between
	 * the arc buf and the hdr, count it as overhead.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
3072
3073 /*
3074 * Remove an arc_buf_t from the hdr's buf list and return the last
3075 * arc_buf_t on the list. If no buffers remain on the list then return
3076 * NULL.
3077 */
3078 static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t * hdr,arc_buf_t * buf)3079 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3080 {
3081 ASSERT(HDR_HAS_L1HDR(hdr));
3082 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
3083
3084 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3085 arc_buf_t *lastbuf = NULL;
3086
3087 /*
3088 * Remove the buf from the hdr list and locate the last
3089 * remaining buffer on the list.
3090 */
3091 while (*bufp != NULL) {
3092 if (*bufp == buf)
3093 *bufp = buf->b_next;
3094
3095 /*
3096 * If we've removed a buffer in the middle of
3097 * the list then update the lastbuf and update
3098 * bufp.
3099 */
3100 if (*bufp != NULL) {
3101 lastbuf = *bufp;
3102 bufp = &(*bufp)->b_next;
3103 }
3104 }
3105 buf->b_next = NULL;
3106 ASSERT3P(lastbuf, !=, buf);
3107 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3108
3109 return (lastbuf);
3110 }
3111
/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 *
 * If the destroyed buf was an uncompressed buf sharing the hdr's data, the
 * hdr's b_pabd is re-established by sharing with the buf that is now last
 * on the list (when there is one and it is not encrypted). The hdr's
 * checksum is freed once no uncompressed buf remains.
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		if (ARC_BUF_SHARED(buf)) {
			/*
			 * Only the hdr's flag is cleared here; the buf keeps
			 * ARC_BUF_FLAG_SHARED, which is tested again below.
			 */
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			ASSERT(!arc_buf_is_shared(buf));
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		/*
		 * If we have no more encrypted buffers and we've already
		 * gotten a copy of the decrypted data we can free b_rabd
		 * to save some space.
		 */
		if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
		    hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
			arc_buf_t *b;
			/* Look for any other encrypted buf on this hdr. */
			for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
				if (b != buf && ARC_BUF_ENCRYPTED(b))
					break;
			}
			if (b == NULL)
				arc_hdr_free_abd(hdr, B_TRUE);
		}
	}

	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space be
		 * wasted temporarily. We must also be careful not to share
		 * encrypted buffers, since they cannot be shared.
		 */
		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
			/* Only one buf can be shared at once */
			ASSERT(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_abd(hdr, B_FALSE);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr)) {
		arc_cksum_free(hdr);
	}

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}
3221
3222 static void
arc_hdr_alloc_abd(arc_buf_hdr_t * hdr,int alloc_flags)3223 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
3224 {
3225 uint64_t size;
3226 boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
3227
3228 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3229 ASSERT(HDR_HAS_L1HDR(hdr));
3230 ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3231 IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3232
3233 if (alloc_rdata) {
3234 size = HDR_GET_PSIZE(hdr);
3235 ASSERT0P(hdr->b_crypt_hdr.b_rabd);
3236 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
3237 alloc_flags);
3238 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3239 ARCSTAT_INCR(arcstat_raw_size, size);
3240 } else {
3241 size = arc_hdr_size(hdr);
3242 ASSERT0P(hdr->b_l1hdr.b_pabd);
3243 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
3244 alloc_flags);
3245 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3246 }
3247
3248 ARCSTAT_INCR(arcstat_compressed_size, size);
3249 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3250 }
3251
3252 static void
arc_hdr_free_abd(arc_buf_hdr_t * hdr,boolean_t free_rdata)3253 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3254 {
3255 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3256
3257 ASSERT(HDR_HAS_L1HDR(hdr));
3258 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3259 IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3260
3261 /*
3262 * If the hdr is currently being written to the l2arc then
3263 * we defer freeing the data by adding it to the l2arc_free_on_write
3264 * list. The l2arc will free the data once it's finished
3265 * writing it to the l2arc device.
3266 */
3267 if (HDR_L2_WRITING(hdr)) {
3268 arc_hdr_free_on_write(hdr, free_rdata);
3269 ARCSTAT_BUMP(arcstat_l2_free_on_write);
3270 } else if (free_rdata) {
3271 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3272 } else {
3273 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
3274 }
3275
3276 if (free_rdata) {
3277 hdr->b_crypt_hdr.b_rabd = NULL;
3278 ARCSTAT_INCR(arcstat_raw_size, -size);
3279 } else {
3280 hdr->b_l1hdr.b_pabd = NULL;
3281 }
3282
3283 if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3284 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3285
3286 ARCSTAT_INCR(arcstat_compressed_size, -size);
3287 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3288 }
3289
3290 /*
3291 * Allocate empty anonymous ARC header. The header will get its identity
3292 * assigned and buffers attached later as part of read or write operations.
3293 *
3294 * In case of read arc_read() assigns header its identify (b_dva + b_birth),
3295 * inserts it into ARC hash to become globally visible and allocates physical
3296 * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read
3297 * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
3298 * sharing one of them with the physical ABD buffer.
3299 *
3300 * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
3301 * data. Then after compression and/or encryption arc_write_ready() allocates
3302 * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
3303 * buffer. On disk write completion arc_write_done() assigns the header its
3304 * new identity (b_dva + b_birth) and inserts into ARC hash.
3305 *
3306 * In case of partial overwrite the old data is read first as described. Then
3307 * arc_release() either allocates new anonymous ARC header and moves the ARC
3308 * buffer to it, or reuses the old ARC header by discarding its identity and
3309 * removing it from ARC hash. After buffer modification normal write process
3310 * follows as described.
3311 */
3312 static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa,int32_t psize,int32_t lsize,boolean_t protected,enum zio_compress compression_type,uint8_t complevel,arc_buf_contents_t type)3313 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3314 boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
3315 arc_buf_contents_t type)
3316 {
3317 arc_buf_hdr_t *hdr;
3318
3319 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3320 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3321
3322 ASSERT(HDR_EMPTY(hdr));
3323 #ifdef ZFS_DEBUG
3324 ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
3325 #endif
3326 HDR_SET_PSIZE(hdr, psize);
3327 HDR_SET_LSIZE(hdr, lsize);
3328 hdr->b_spa = spa;
3329 hdr->b_type = type;
3330 hdr->b_flags = 0;
3331 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3332 arc_hdr_set_compress(hdr, compression_type);
3333 hdr->b_complevel = complevel;
3334 if (protected)
3335 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3336
3337 hdr->b_l1hdr.b_state = arc_anon;
3338 hdr->b_l1hdr.b_arc_access = 0;
3339 hdr->b_l1hdr.b_mru_hits = 0;
3340 hdr->b_l1hdr.b_mru_ghost_hits = 0;
3341 hdr->b_l1hdr.b_mfu_hits = 0;
3342 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
3343 hdr->b_l1hdr.b_buf = NULL;
3344
3345 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3346
3347 return (hdr);
3348 }
3349
/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 *
 * "old" must be the cache hdr currently belongs to and "new" the other of
 * the two caches. The hdr must have an L2 hdr and its hash lock must be
 * held. The old hdr is freed; the newly allocated replacement is returned
 * after taking over the old hdr's place in the hash table, on the device's
 * l2ad_buflist and in the l2ad_alloc refcount.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	/* Only the first HDR_L2ONLY_SIZE bytes are carried over. */
	memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT0P(nhdr->b_l1hdr.b_pabd);
		ASSERT(!HDR_HAS_RABD(hdr));
	} else {
		ASSERT0P(hdr->b_l1hdr.b_buf);
#ifdef ZFS_DEBUG
		ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
#endif

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY0P(hdr->b_l1hdr.b_pabd);
		ASSERT(!HDR_HAS_RABD(hdr));

		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
	    arc_hdr_size(hdr), hdr);
	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
	    arc_hdr_size(nhdr), nhdr);

	/* The old hdr has its identity discarded before it is freed. */
	buf_discard_identity(hdr);
	kmem_cache_free(old, hdr);

	return (nhdr);
}
3453
3454 /*
3455 * This function is used by the send / receive code to convert a newly
3456 * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
3457 * is also used to allow the root objset block to be updated without altering
3458 * its embedded MACs. Both block types will always be uncompressed so we do not
3459 * have to worry about compression type or psize.
3460 */
3461 void
arc_convert_to_raw(arc_buf_t * buf,uint64_t dsobj,boolean_t byteorder,dmu_object_type_t ot,const uint8_t * salt,const uint8_t * iv,const uint8_t * mac)3462 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
3463 dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
3464 const uint8_t *mac)
3465 {
3466 arc_buf_hdr_t *hdr = buf->b_hdr;
3467
3468 ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
3469 ASSERT(HDR_HAS_L1HDR(hdr));
3470 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3471
3472 buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
3473 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3474 hdr->b_crypt_hdr.b_dsobj = dsobj;
3475 hdr->b_crypt_hdr.b_ot = ot;
3476 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3477 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3478 if (!arc_hdr_has_uncompressed_buf(hdr))
3479 arc_cksum_free(hdr);
3480
3481 if (salt != NULL)
3482 memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
3483 if (iv != NULL)
3484 memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
3485 if (mac != NULL)
3486 memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
3487 }
3488
3489 /*
3490 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3491 * The buf is returned thawed since we expect the consumer to modify it.
3492 */
3493 arc_buf_t *
arc_alloc_buf(spa_t * spa,const void * tag,arc_buf_contents_t type,int32_t size)3494 arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
3495 int32_t size)
3496 {
3497 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3498 B_FALSE, ZIO_COMPRESS_OFF, 0, type);
3499
3500 arc_buf_t *buf = NULL;
3501 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
3502 B_FALSE, B_FALSE, &buf));
3503 arc_buf_thaw(buf);
3504
3505 return (buf);
3506 }
3507
3508 /*
3509 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3510 * for bufs containing metadata.
3511 */
3512 arc_buf_t *
arc_alloc_compressed_buf(spa_t * spa,const void * tag,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)3513 arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
3514 uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
3515 {
3516 ASSERT3U(lsize, >, 0);
3517 ASSERT3U(lsize, >=, psize);
3518 ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3519 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3520
3521 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3522 B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
3523
3524 arc_buf_t *buf = NULL;
3525 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
3526 B_TRUE, B_FALSE, B_FALSE, &buf));
3527 arc_buf_thaw(buf);
3528
3529 /*
3530 * To ensure that the hdr has the correct data in it if we call
3531 * arc_untransform() on this buf before it's been written to disk,
3532 * it's easiest if we just set up sharing between the buf and the hdr.
3533 */
3534 arc_share_buf(hdr, buf);
3535
3536 return (buf);
3537 }
3538
3539 arc_buf_t *
arc_alloc_raw_buf(spa_t * spa,const void * tag,uint64_t dsobj,boolean_t byteorder,const uint8_t * salt,const uint8_t * iv,const uint8_t * mac,dmu_object_type_t ot,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)3540 arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
3541 boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
3542 const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3543 enum zio_compress compression_type, uint8_t complevel)
3544 {
3545 arc_buf_hdr_t *hdr;
3546 arc_buf_t *buf;
3547 arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3548 ARC_BUFC_METADATA : ARC_BUFC_DATA;
3549
3550 ASSERT3U(lsize, >, 0);
3551 ASSERT3U(lsize, >=, psize);
3552 ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3553 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3554
3555 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3556 compression_type, complevel, type);
3557
3558 hdr->b_crypt_hdr.b_dsobj = dsobj;
3559 hdr->b_crypt_hdr.b_ot = ot;
3560 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3561 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3562 memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
3563 memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
3564 memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
3565
3566 /*
3567 * This buffer will be considered encrypted even if the ot is not an
3568 * encrypted type. It will become authenticated instead in
3569 * arc_write_ready().
3570 */
3571 buf = NULL;
3572 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
3573 B_FALSE, B_FALSE, &buf));
3574 arc_buf_thaw(buf);
3575
3576 return (buf);
3577 }
3578
3579 static void
l2arc_hdr_arcstats_update(arc_buf_hdr_t * hdr,boolean_t incr,boolean_t state_only)3580 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
3581 boolean_t state_only)
3582 {
3583 uint64_t lsize = HDR_GET_LSIZE(hdr);
3584 uint64_t psize = HDR_GET_PSIZE(hdr);
3585 uint64_t asize = HDR_GET_L2SIZE(hdr);
3586 arc_buf_contents_t type = hdr->b_type;
3587 int64_t lsize_s;
3588 int64_t psize_s;
3589 int64_t asize_s;
3590
3591 /* For L2 we expect the header's b_l2size to be valid */
3592 ASSERT3U(asize, >=, psize);
3593
3594 if (incr) {
3595 lsize_s = lsize;
3596 psize_s = psize;
3597 asize_s = asize;
3598 } else {
3599 lsize_s = -lsize;
3600 psize_s = -psize;
3601 asize_s = -asize;
3602 }
3603
3604 /* If the buffer is a prefetch, count it as such. */
3605 if (HDR_PREFETCH(hdr)) {
3606 ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
3607 } else {
3608 /*
3609 * We use the value stored in the L2 header upon initial
3610 * caching in L2ARC. This value will be updated in case
3611 * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
3612 * metadata (log entry) cannot currently be updated. Having
3613 * the ARC state in the L2 header solves the problem of a
3614 * possibly absent L1 header (apparent in buffers restored
3615 * from persistent L2ARC).
3616 */
3617 switch (hdr->b_l2hdr.b_arcs_state) {
3618 case ARC_STATE_MRU_GHOST:
3619 case ARC_STATE_MRU:
3620 ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
3621 break;
3622 case ARC_STATE_MFU_GHOST:
3623 case ARC_STATE_MFU:
3624 ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
3625 break;
3626 default:
3627 break;
3628 }
3629 }
3630
3631 if (state_only)
3632 return;
3633
3634 ARCSTAT_INCR(arcstat_l2_psize, psize_s);
3635 ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
3636
3637 switch (type) {
3638 case ARC_BUFC_DATA:
3639 ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
3640 break;
3641 case ARC_BUFC_METADATA:
3642 ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
3643 break;
3644 default:
3645 break;
3646 }
3647 }
3648
3649
3650 static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t * hdr)3651 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3652 {
3653 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3654 l2arc_dev_t *dev = l2hdr->b_dev;
3655
3656 ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3657 ASSERT(HDR_HAS_L2HDR(hdr));
3658
3659 list_remove(&dev->l2ad_buflist, hdr);
3660
3661 l2arc_hdr_arcstats_decrement(hdr);
3662 if (dev->l2ad_vdev != NULL) {
3663 uint64_t asize = HDR_GET_L2SIZE(hdr);
3664 vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
3665 }
3666
3667 (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3668 hdr);
3669 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3670 }
3671
3672 static void
arc_hdr_destroy(arc_buf_hdr_t * hdr)3673 arc_hdr_destroy(arc_buf_hdr_t *hdr)
3674 {
3675 if (HDR_HAS_L1HDR(hdr)) {
3676 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3677 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3678 }
3679 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3680 ASSERT(!HDR_IN_HASH_TABLE(hdr));
3681 boolean_t l1hdr_destroyed = B_FALSE;
3682
3683 /*
3684 * If L2_WRITING, destroy L1HDR before L2HDR (under mutex) so
3685 * arc_hdr_free_abd() can properly defer ABDs. Otherwise, destroy
3686 * L1HDR outside mutex to minimize contention.
3687 */
3688 if (HDR_HAS_L2HDR(hdr)) {
3689 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3690 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3691
3692 if (!buflist_held)
3693 mutex_enter(&dev->l2ad_mtx);
3694
3695 /*
3696 * Even though we checked this conditional above, we
3697 * need to check this again now that we have the
3698 * l2ad_mtx. This is because we could be racing with
3699 * another thread calling l2arc_evict() which might have
3700 * destroyed this header's L2 portion as we were waiting
3701 * to acquire the l2ad_mtx. If that happens, we don't
3702 * want to re-destroy the header's L2 portion.
3703 */
3704 if (HDR_HAS_L2HDR(hdr)) {
3705 if (HDR_L2_WRITING(hdr)) {
3706 l1hdr_destroyed = B_TRUE;
3707
3708 if (!HDR_EMPTY(hdr))
3709 buf_discard_identity(hdr);
3710
3711 if (HDR_HAS_L1HDR(hdr)) {
3712 arc_cksum_free(hdr);
3713
3714 while (hdr->b_l1hdr.b_buf != NULL)
3715 arc_buf_destroy_impl(
3716 hdr->b_l1hdr.b_buf);
3717
3718 if (hdr->b_l1hdr.b_pabd != NULL)
3719 arc_hdr_free_abd(hdr, B_FALSE);
3720
3721 if (HDR_HAS_RABD(hdr))
3722 arc_hdr_free_abd(hdr, B_TRUE);
3723 }
3724 }
3725
3726 arc_hdr_l2hdr_destroy(hdr);
3727 }
3728
3729 if (!buflist_held)
3730 mutex_exit(&dev->l2ad_mtx);
3731 }
3732
3733 if (!l1hdr_destroyed) {
3734 if (!HDR_EMPTY(hdr))
3735 buf_discard_identity(hdr);
3736
3737 if (HDR_HAS_L1HDR(hdr)) {
3738 arc_cksum_free(hdr);
3739
3740 while (hdr->b_l1hdr.b_buf != NULL)
3741 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
3742
3743 if (hdr->b_l1hdr.b_pabd != NULL)
3744 arc_hdr_free_abd(hdr, B_FALSE);
3745
3746 if (HDR_HAS_RABD(hdr))
3747 arc_hdr_free_abd(hdr, B_TRUE);
3748 }
3749 }
3750
3751 ASSERT0P(hdr->b_hash_next);
3752 if (HDR_HAS_L1HDR(hdr)) {
3753 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3754 ASSERT0P(hdr->b_l1hdr.b_acb);
3755 #ifdef ZFS_DEBUG
3756 ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
3757 #endif
3758 kmem_cache_free(hdr_full_cache, hdr);
3759 } else {
3760 kmem_cache_free(hdr_l2only_cache, hdr);
3761 }
3762 }
3763
3764 void
arc_buf_destroy(arc_buf_t * buf,const void * tag)3765 arc_buf_destroy(arc_buf_t *buf, const void *tag)
3766 {
3767 arc_buf_hdr_t *hdr = buf->b_hdr;
3768
3769 if (hdr->b_l1hdr.b_state == arc_anon) {
3770 ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
3771 ASSERT(ARC_BUF_LAST(buf));
3772 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3773 VERIFY0(remove_reference(hdr, tag));
3774 return;
3775 }
3776
3777 kmutex_t *hash_lock = HDR_LOCK(hdr);
3778 mutex_enter(hash_lock);
3779
3780 ASSERT3P(hdr, ==, buf->b_hdr);
3781 ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
3782 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3783 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3784 ASSERT3P(buf->b_data, !=, NULL);
3785
3786 arc_buf_destroy_impl(buf);
3787 (void) remove_reference(hdr, tag);
3788 mutex_exit(hash_lock);
3789 }
3790
3791 /*
3792 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
3793 * state of the header is dependent on its state prior to entering this
3794 * function. The following transitions are possible:
3795 *
3796 * - arc_mru -> arc_mru_ghost
3797 * - arc_mfu -> arc_mfu_ghost
3798 * - arc_mru_ghost -> arc_l2c_only
3799 * - arc_mru_ghost -> deleted
3800 * - arc_mfu_ghost -> arc_l2c_only
3801 * - arc_mfu_ghost -> deleted
3802 * - arc_uncached -> deleted
3803 *
3804 * Return total size of evicted data buffers for eviction progress tracking.
3805 * When evicting from ghost states return logical buffer size to make eviction
3806 * progress at the same (or at least comparable) rate as from non-ghost states.
3807 *
3808 * Return *real_evicted for actual ARC size reduction to wake up threads
3809 * waiting for it. For non-ghost states it includes size of evicted data
3810 * buffers (the headers are not freed there). For ghost states it includes
3811 * only the evicted headers size.
3812 */
3813 static int64_t
arc_evict_hdr(arc_buf_hdr_t * hdr,uint64_t * real_evicted)3814 arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
3815 {
3816 arc_state_t *evicted_state, *state;
3817 int64_t bytes_evicted = 0;
3818
3819 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3820 ASSERT(HDR_HAS_L1HDR(hdr));
3821 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3822 ASSERT0P(hdr->b_l1hdr.b_buf);
3823 ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3824
3825 *real_evicted = 0;
3826 state = hdr->b_l1hdr.b_state;
3827 if (GHOST_STATE(state)) {
3828
3829 /*
3830 * l2arc_write_buffers() relies on a header's L1 portion
3831 * (i.e. its b_pabd field) during it's write phase.
3832 * Thus, we cannot push a header onto the arc_l2c_only
3833 * state (removing its L1 piece) until the header is
3834 * done being written to the l2arc.
3835 */
3836 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3837 ARCSTAT_BUMP(arcstat_evict_l2_skip);
3838 return (bytes_evicted);
3839 }
3840
3841 ARCSTAT_BUMP(arcstat_deleted);
3842 bytes_evicted += HDR_GET_LSIZE(hdr);
3843
3844 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
3845
3846 if (HDR_HAS_L2HDR(hdr)) {
3847 ASSERT0P(hdr->b_l1hdr.b_pabd);
3848 ASSERT(!HDR_HAS_RABD(hdr));
3849 /*
3850 * This buffer is cached on the 2nd Level ARC;
3851 * don't destroy the header.
3852 */
3853 arc_change_state(arc_l2c_only, hdr);
3854 /*
3855 * dropping from L1+L2 cached to L2-only,
3856 * realloc to remove the L1 header.
3857 */
3858 (void) arc_hdr_realloc(hdr, hdr_full_cache,
3859 hdr_l2only_cache);
3860 *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
3861 } else {
3862 arc_change_state(arc_anon, hdr);
3863 arc_hdr_destroy(hdr);
3864 *real_evicted += HDR_FULL_SIZE;
3865 }
3866 return (bytes_evicted);
3867 }
3868
3869 ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
3870 evicted_state = (state == arc_uncached) ? arc_anon :
3871 ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
3872
3873 /* prefetch buffers have a minimum lifespan */
3874 uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
3875 arc_min_prescient_prefetch : arc_min_prefetch;
3876 if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
3877 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime) {
3878 ARCSTAT_BUMP(arcstat_evict_skip);
3879 return (bytes_evicted);
3880 }
3881
3882 if (HDR_HAS_L2HDR(hdr)) {
3883 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
3884 } else {
3885 if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3886 ARCSTAT_INCR(arcstat_evict_l2_eligible,
3887 HDR_GET_LSIZE(hdr));
3888
3889 switch (state->arcs_state) {
3890 case ARC_STATE_MRU:
3891 ARCSTAT_INCR(
3892 arcstat_evict_l2_eligible_mru,
3893 HDR_GET_LSIZE(hdr));
3894 break;
3895 case ARC_STATE_MFU:
3896 ARCSTAT_INCR(
3897 arcstat_evict_l2_eligible_mfu,
3898 HDR_GET_LSIZE(hdr));
3899 break;
3900 default:
3901 break;
3902 }
3903 } else {
3904 ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3905 HDR_GET_LSIZE(hdr));
3906 }
3907 }
3908
3909 bytes_evicted += arc_hdr_size(hdr);
3910 *real_evicted += arc_hdr_size(hdr);
3911
3912 /*
3913 * If this hdr is being evicted and has a compressed buffer then we
3914 * discard it here before we change states. This ensures that the
3915 * accounting is updated correctly in arc_free_data_impl().
3916 */
3917 if (hdr->b_l1hdr.b_pabd != NULL)
3918 arc_hdr_free_abd(hdr, B_FALSE);
3919
3920 if (HDR_HAS_RABD(hdr))
3921 arc_hdr_free_abd(hdr, B_TRUE);
3922
3923 arc_change_state(evicted_state, hdr);
3924 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
3925 if (evicted_state == arc_anon) {
3926 arc_hdr_destroy(hdr);
3927 *real_evicted += HDR_FULL_SIZE;
3928 } else {
3929 ASSERT(HDR_IN_HASH_TABLE(hdr));
3930 }
3931
3932 return (bytes_evicted);
3933 }
3934
3935 static void
arc_set_need_free(void)3936 arc_set_need_free(void)
3937 {
3938 ASSERT(MUTEX_HELD(&arc_evict_lock));
3939 int64_t remaining = arc_free_memory() - arc_sys_free / 2;
3940 arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
3941 if (aw == NULL) {
3942 arc_need_free = MAX(-remaining, 0);
3943 } else {
3944 arc_need_free =
3945 MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
3946 }
3947 }
3948
/*
 * Evict headers from a single sublist of 'ml'.
 *
 *	ml	multilist to evict from
 *	idx	index of the sublist to process
 *	marker	marker header already inserted in that sublist; it records
 *		how far the walk has got and is advanced past every header
 *		that is examined
 *	spa	if non-zero, only headers whose b_spa equals it are evicted
 *	bytes	stop once at least this many bytes have been evicted
 *	more	if non-NULL, set to B_TRUE when the walk stopped before
 *		running out of headers (i.e. on the byte target or the
 *		zfs_arc_evict_batch_limit header limit), B_FALSE otherwise
 *
 * Other threads' markers and headers whose hash lock cannot be taken
 * without blocking are skipped. After the walk, arc_evict_count is advanced
 * by the real size reduction and eligible waiters are woken.
 *
 * Returns the sum of the values returned by arc_evict_hdr().
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, uint64_t bytes, boolean_t *more)
{
	multilist_sublist_t *mls;
	uint64_t bytes_evicted = 0, real_evicted = 0;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	uint_t evict_count = zfs_arc_evict_batch_limit;

	ASSERT3P(marker, !=, NULL);

	mls = multilist_sublist_lock_idx(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
	    hdr = multilist_sublist_prev(mls, marker)) {
		/* Batch limit used up or byte target reached. */
		if ((evict_count == 0) || (bytes_evicted >= bytes))
			break;

		/*
		 * To keep our iteration location, move the marker
		 * forward. Since we're not holding hdr's hash lock, we
		 * must be very careful and not remove 'hdr' from the
		 * sublist. Otherwise, other consumers might mistake the
		 * 'hdr' as not being on a sublist when they call the
		 * multilist_link_active() function (they all rely on
		 * the hash lock protecting concurrent insertions and
		 * removals). multilist_sublist_move_forward() was
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
		multilist_sublist_move_forward(mls, marker);

		/*
		 * The only case where the b_spa field should ever be
		 * zero, is the marker headers inserted by
		 * arc_evict_state(). It's possible for multiple threads
		 * to be calling arc_evict_state() concurrently (e.g.
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
		if (hdr->b_spa == 0)
			continue;

		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We aren't calling this function from any code path
		 * that would already be holding a hash lock, so we're
		 * asserting on this assumption to be defensive in case
		 * this ever changes. Without this check, it would be
		 * possible to incorrectly increment arcstat_mutex_miss
		 * below (e.g. if the code changed such that we called
		 * this function with a hash lock held).
		 */
		ASSERT(!MUTEX_HELD(hash_lock));

		if (mutex_tryenter(hash_lock)) {
			uint64_t revicted;
			uint64_t evicted = arc_evict_hdr(hdr, &revicted);
			mutex_exit(hash_lock);

			bytes_evicted += evicted;
			real_evicted += revicted;

			/*
			 * If evicted is zero, arc_evict_hdr() must have
			 * decided to skip this header, so don't charge it
			 * against the batch limit (evict_count counts down).
			 */
			if (evicted != 0)
				evict_count--;

		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}

	multilist_sublist_unlock(mls);

	/*
	 * Indicate if another iteration may be productive: hdr is only
	 * non-NULL here when the loop was left through the break above.
	 */
	if (more)
		*more = (hdr != NULL);

	/*
	 * Increment the count of evicted bytes, and wake up any threads that
	 * are waiting for the count to reach this value.  Since the list is
	 * ordered by ascending aew_count, we pop off the beginning of the
	 * list until we reach the end, or a waiter that's past the current
	 * "count".  Doing this outside the loop reduces the number of times
	 * we need to acquire the global arc_evict_lock.
	 *
	 * Only wake when there's sufficient free memory in the system
	 * (specifically, arc_sys_free/2, which by default is a bit more than
	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
	 */
	mutex_enter(&arc_evict_lock);
	arc_evict_count += real_evicted;

	if (arc_free_memory() > arc_sys_free / 2) {
		arc_evict_waiter_t *aw;
		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
		    aw->aew_count <= arc_evict_count) {
			list_remove(&arc_evict_waiters, aw);
			cv_signal(&aw->aew_cv);
		}
	}
	arc_set_need_free();
	mutex_exit(&arc_evict_lock);

	return (bytes_evicted);
}
4067
4068 static arc_buf_hdr_t *
arc_state_alloc_marker(void)4069 arc_state_alloc_marker(void)
4070 {
4071 arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4072
4073 /*
4074 * A b_spa of 0 is used to indicate that this header is
4075 * a marker. This fact is used in arc_evict_state_impl().
4076 */
4077 marker->b_spa = 0;
4078
4079 return (marker);
4080 }
4081
/*
 * Return a marker obtained from arc_state_alloc_marker() to hdr_full_cache.
 */
static void
arc_state_free_marker(arc_buf_hdr_t *marker)
{
	kmem_cache_free(hdr_full_cache, marker);
}
4087
4088 /*
4089 * Allocate an array of buffer headers used as placeholders during arc state
4090 * eviction.
4091 */
4092 static arc_buf_hdr_t **
arc_state_alloc_markers(int count)4093 arc_state_alloc_markers(int count)
4094 {
4095 arc_buf_hdr_t **markers;
4096
4097 markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
4098 for (int i = 0; i < count; i++)
4099 markers[i] = arc_state_alloc_marker();
4100 return (markers);
4101 }
4102
4103 static void
arc_state_free_markers(arc_buf_hdr_t ** markers,int count)4104 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
4105 {
4106 for (int i = 0; i < count; i++)
4107 arc_state_free_marker(markers[i]);
4108 kmem_free(markers, sizeof (*markers) * count);
4109 }
4110
/*
 * Argument block for one arc_evict_task() dispatched by arc_evict_state().
 */
typedef struct evict_arg {
	taskq_ent_t	eva_tqent;	/* taskq entry used for the dispatch */
	multilist_t	*eva_ml;	/* multilist being evicted from */
	arc_buf_hdr_t	*eva_marker;	/* marker of the assigned sublist */
	int		eva_idx;	/* index of the assigned sublist */
	uint64_t	eva_spa;	/* spa filter passed on to eviction */
	uint64_t	eva_bytes;	/* bytes this task should evict */
	uint64_t	eva_evicted;	/* out: bytes the task evicted */
} evict_arg_t;
4120
4121 static void
arc_evict_task(void * arg)4122 arc_evict_task(void *arg)
4123 {
4124 evict_arg_t *eva = arg;
4125 uint64_t total_evicted = 0;
4126 boolean_t more;
4127 uint_t batches = zfs_arc_evict_batches_limit;
4128
4129 /* Process multiple batches to amortize taskq dispatch overhead. */
4130 do {
4131 total_evicted += arc_evict_state_impl(eva->eva_ml,
4132 eva->eva_idx, eva->eva_marker, eva->eva_spa,
4133 eva->eva_bytes - total_evicted, &more);
4134 } while (total_evicted < eva->eva_bytes && --batches > 0 && more);
4135
4136 eva->eva_evicted = total_evicted;
4137 }
4138
4139 static void
arc_evict_thread_init(void)4140 arc_evict_thread_init(void)
4141 {
4142 if (zfs_arc_evict_threads == 0) {
4143 /*
4144 * Compute number of threads we want to use for eviction.
4145 *
4146 * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
4147 * default max of 16 threads at ~256 CPUs.
4148 *
4149 * However, that formula goes to two threads at 4 CPUs, which
4150 * is still rather to low to be really useful, so we just go
4151 * with 1 thread at fewer than 6 cores.
4152 */
4153 if (max_ncpus < 6)
4154 zfs_arc_evict_threads = 1;
4155 else
4156 zfs_arc_evict_threads =
4157 (highbit64(max_ncpus) - 1) + max_ncpus / 32;
4158 } else if (zfs_arc_evict_threads > max_ncpus)
4159 zfs_arc_evict_threads = max_ncpus;
4160
4161 if (zfs_arc_evict_threads > 1) {
4162 arc_evict_taskq = taskq_create("arc_evict",
4163 zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
4164 TASKQ_PREPOPULATE);
4165 arc_evict_arg = kmem_zalloc(
4166 sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
4167 }
4168 }
4169
/*
 * The minimum number of bytes we can evict at once is one block, so
 * SPA_MAXBLOCKSIZE is a reasonable minimum amount of work per eviction task.
 * We use this value to compute a scaling factor for the eviction tasks.
 */
4175 #define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
4176
/*
 * Evict buffers from the given arc state, until we've removed the
 * specified number of bytes.  Move the removed buffers to the
 * appropriate evict state.
 *
 * This function makes a "best effort".  It skips over any buffers
 * it can't get a hash_lock on, and so, may not catch all candidates.
 * It may also return without evicting as much space as requested.
 *
 * If bytes is specified using the special value ARC_EVICT_ALL, this
 * will evict all available (i.e. unlocked and evictable) buffers from
 * the given arc state; which is used by arc_flush().
 *
 *	state	arc state to evict from
 *	type	which of the state's per-type multilists to use
 *	spa	if non-zero, restrict eviction to headers of this spa
 *	bytes	number of bytes to evict, or ARC_EVICT_ALL
 *
 * Returns the number of bytes evicted, as reported by
 * arc_evict_state_impl() or by the eviction tasks.
 */
static uint64_t
arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
    uint64_t bytes)
{
	uint64_t total_evicted = 0;
	multilist_t *ml = &state->arcs_list[type];
	int num_sublists;
	arc_buf_hdr_t **markers;
	evict_arg_t *eva = NULL;

	num_sublists = multilist_get_num_sublists(ml);

	/* Parallel eviction is only attempted with more than one thread. */
	boolean_t use_evcttq = zfs_arc_evict_threads > 1;

	/*
	 * If we've tried to evict from each sublist, made some
	 * progress, but still have not hit the target number of bytes
	 * to evict, we want to keep trying. The markers allow us to
	 * pick up where we left off for each individual sublist, rather
	 * than starting from the tail each time.
	 *
	 * The eviction zthr uses the preallocated marker array; any other
	 * caller allocates its own and frees it before returning.
	 */
	if (zthr_iscurthread(arc_evict_zthr)) {
		markers = arc_state_evict_markers;
		ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
	} else {
		markers = arc_state_alloc_markers(num_sublists);
	}
	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls;

		mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_insert_tail(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

	if (use_evcttq) {
		/*
		 * Same ownership rule as for the markers: the zthr uses the
		 * preallocated argument array, others allocate one without
		 * sleeping.
		 */
		if (zthr_iscurthread(arc_evict_zthr))
			eva = arc_evict_arg;
		else
			eva = kmem_alloc(sizeof (evict_arg_t) *
			    zfs_arc_evict_threads, KM_NOSLEEP);
		if (eva) {
			for (int i = 0; i < zfs_arc_evict_threads; i++) {
				taskq_init_ent(&eva[i].eva_tqent);
				eva[i].eva_ml = ml;
				eva[i].eva_spa = spa;
			}
		} else {
			/*
			 * Fall back to the regular single evict if it is not
			 * possible to allocate memory for the taskq entries.
			 */
			use_evcttq = B_FALSE;
		}
	}

	/*
	 * Start eviction using a randomly selected sublist, this is to try and
	 * evenly balance eviction across all sublists. Always starting at the
	 * same sublist (e.g. index 0) would cause evictions to favor certain
	 * sublists over others.
	 */
	uint64_t scan_evicted = 0;
	int sublists_left = num_sublists;
	int sublist_idx = multilist_get_random_index(ml);

	/*
	 * While we haven't hit our target number of bytes to evict, or
	 * we're evicting all available buffers.
	 */
	while (total_evicted < bytes) {
		uint64_t evict = MIN_EVICT_SIZE;
		uint_t ntasks = zfs_arc_evict_threads;

		if (use_evcttq) {
			/* Never dispatch more tasks than sublists remain. */
			if (sublists_left < ntasks)
				ntasks = sublists_left;

			if (ntasks < 2)
				use_evcttq = B_FALSE;
		}

		if (use_evcttq) {
			uint64_t left = bytes - total_evicted;

			/*
			 * Split what is left evenly over the tasks. When
			 * that would give a task less than MIN_EVICT_SIZE,
			 * use fewer tasks; with fewer than two tasks, evict
			 * inline for the rest of this scan.
			 */
			if (bytes == ARC_EVICT_ALL) {
				evict = bytes;
			} else if (left >= ntasks * MIN_EVICT_SIZE) {
				evict = DIV_ROUND_UP(left, ntasks);
			} else {
				ntasks = left / MIN_EVICT_SIZE;
				if (ntasks < 2)
					use_evcttq = B_FALSE;
				else
					evict = DIV_ROUND_UP(left, ntasks);
			}
		}

		for (int i = 0; sublists_left > 0; i++, sublist_idx++,
		    sublists_left--) {
			uint64_t bytes_evicted;

			/* we've reached the end, wrap to the beginning */
			if (sublist_idx >= num_sublists)
				sublist_idx = 0;

			if (use_evcttq) {
				/* One sublist per task, ntasks at a time. */
				if (i == ntasks)
					break;

				eva[i].eva_marker = markers[sublist_idx];
				eva[i].eva_idx = sublist_idx;
				eva[i].eva_bytes = evict;

				taskq_dispatch_ent(arc_evict_taskq,
				    arc_evict_task, &eva[i], 0,
				    &eva[i].eva_tqent);

				continue;
			}

			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
			    markers[sublist_idx], spa, bytes - total_evicted,
			    NULL);

			scan_evicted += bytes_evicted;
			total_evicted += bytes_evicted;

			if (total_evicted < bytes)
				kpreempt(KPREEMPT_SYNC);
			else
				break;
		}

		if (use_evcttq) {
			/* Collect the results once all tasks are done. */
			taskq_wait(arc_evict_taskq);

			for (int i = 0; i < ntasks; i++) {
				scan_evicted += eva[i].eva_evicted;
				total_evicted += eva[i].eva_evicted;
			}
		}

		/*
		 * If we scanned all sublists and didn't evict anything, we
		 * have no reason to believe we'll evict more during another
		 * scan, so break the loop.
		 */
		if (scan_evicted == 0 && sublists_left == 0) {
			/* This isn't possible, let's make that obvious */
			ASSERT3S(bytes, !=, 0);

			/*
			 * When bytes is ARC_EVICT_ALL, the only way to
			 * break the loop is when scan_evicted is zero.
			 * In that case, we actually have evicted enough,
			 * so we don't want to increment the kstat.
			 */
			if (bytes != ARC_EVICT_ALL) {
				ASSERT3S(total_evicted, <, bytes);
				ARCSTAT_BUMP(arcstat_evict_not_enough);
			}

			break;
		}

		/*
		 * If we scanned all sublists but still have more to do,
		 * reset the counts so we can go around again.
		 */
		if (sublists_left == 0) {
			sublists_left = num_sublists;
			sublist_idx = multilist_get_random_index(ml);
			scan_evicted = 0;

			/*
			 * Since we're about to reconsider all sublists,
			 * re-enable use of the evict threads if available.
			 */
			use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
		}
	}

	/* Only a privately allocated argument array is freed here. */
	if (eva != NULL && eva != arc_evict_arg)
		kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);

	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

	/* Likewise for the markers. */
	if (markers != arc_state_evict_markers)
		arc_state_free_markers(markers, num_sublists);

	return (total_evicted);
}
4387
4388 /*
4389 * Flush all "evictable" data of the given type from the arc state
4390 * specified. This will not evict any "active" buffers (i.e. referenced).
4391 *
4392 * When 'retry' is set to B_FALSE, the function will make a single pass
4393 * over the state and evict any buffers that it can. Since it doesn't
4394 * continually retry the eviction, it might end up leaving some buffers
4395 * in the ARC due to lock misses.
4396 *
4397 * When 'retry' is set to B_TRUE, the function will continually retry the
4398 * eviction until *all* evictable buffers have been removed from the
4399 * state. As a result, if concurrent insertions into the state are
4400 * allowed (e.g. if the ARC isn't shutting down), this function might
4401 * wind up in an infinite loop, continually trying to evict buffers.
4402 */
4403 static uint64_t
arc_flush_state(arc_state_t * state,uint64_t spa,arc_buf_contents_t type,boolean_t retry)4404 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4405 boolean_t retry)
4406 {
4407 uint64_t evicted = 0;
4408
4409 while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
4410 evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
4411
4412 if (!retry)
4413 break;
4414 }
4415
4416 return (evicted);
4417 }
4418
4419 /*
4420 * Evict the specified number of bytes from the state specified. This
4421 * function prevents us from trying to evict more from a state's list
4422 * than is "evictable", and to skip evicting altogether when passed a
4423 * negative value for "bytes". In contrast, arc_evict_state() will
4424 * evict everything it can, when passed a negative value for "bytes".
4425 */
4426 static uint64_t
arc_evict_impl(arc_state_t * state,arc_buf_contents_t type,int64_t bytes)4427 arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
4428 {
4429 uint64_t delta;
4430
4431 if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
4432 delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
4433 bytes);
4434 return (arc_evict_state(state, type, 0, delta));
4435 }
4436
4437 return (0);
4438 }
4439
4440 /*
4441 * Adjust specified fraction, taking into account initial ghost state(s) size,
4442 * ghost hit bytes towards increasing the fraction, ghost hit bytes towards
4443 * decreasing it, plus a balance factor, controlling the decrease rate, used
4444 * to balance metadata vs data.
4445 */
4446 static uint64_t
arc_evict_adj(uint64_t frac,uint64_t total,uint64_t up,uint64_t down,uint_t balance)4447 arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
4448 uint_t balance)
4449 {
4450 if (total < 32 || up + down == 0)
4451 return (frac);
4452
4453 /*
4454 * We should not have more ghost hits than ghost size, but they may
4455 * get close. To avoid overflows below up/down should not be bigger
4456 * than 1/5 of total. But to limit maximum adjustment speed restrict
4457 * it some more.
4458 */
4459 if (up + down >= total / 16) {
4460 uint64_t scale = (up + down) / (total / 32);
4461 up /= scale;
4462 down /= scale;
4463 }
4464
4465 /* Get maximal dynamic range by choosing optimal shifts. */
4466 int s = highbit64(total);
4467 s = MIN(64 - s, 32);
4468
4469 ASSERT3U(frac, <=, 1ULL << 32);
4470 uint64_t ofrac = (1ULL << 32) - frac;
4471
4472 if (frac >= 4 * ofrac)
4473 up /= frac / (2 * ofrac + 1);
4474 up = (up << s) / (total >> (32 - s));
4475 if (ofrac >= 4 * frac)
4476 down /= ofrac / (2 * frac + 1);
4477 down = (down << s) / (total >> (32 - s));
4478 down = down * 100 / balance;
4479
4480 ASSERT3U(up, <=, (1ULL << 32) - frac);
4481 ASSERT3U(down, <=, frac);
4482 return (frac + up - down);
4483 }
4484
/*
 * Calculate (x * multiplier / divisor) without unnecessary overflows, by
 * scaling the whole multiples of 'divisor' contained in 'x' and the
 * remainder separately. 'divisor' must not be zero.
 */
static uint64_t
arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor)
{
	uint64_t whole_part = (x / divisor) * multiplier;
	uint64_t rem_part = ((x % divisor) * multiplier) / divisor;

	return (whole_part + rem_part);
}
4496
/*
 * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
 *
 * One pass does the following, in this order:
 *  1. sample the sizes of the MRU (plus anon) and MFU states;
 *  2. compute the ghost hits seen since the previous call and feed them to
 *     arc_evict_adj() to update arc_meta, arc_pd and arc_pm;
 *  3. request dnode pruning if too much metadata is not evictable or the
 *     dnode size is above arc_dnode_limit;
 *  4. evict MRU metadata, MFU metadata, MRU data and MFU data;
 *  5. trim the four ghost lists to new target sizes.
 *
 * Returns the number of bytes evicted in step 4 (ghost list evictions are
 * not counted).
 *
 * The function keeps state in static locals between calls.
 */
static uint64_t
arc_evict(void)
{
	uint64_t bytes, total_evicted = 0;
	/*
	 * mru/mfu + d/m: sizes of MRU/MFU data/metadata; e: amount to evict;
	 * w: wanted size of whatever is being evicted next.
	 */
	int64_t e, mrud, mrum, mfud, mfum, w;
	/* Ghost hit counter values observed by the previous call. */
	static uint64_t ogrd, ogrm, ogfd, ogfm;
	/* Ghost list target sizes chosen by the previous call. */
	static uint64_t gsrd, gsrm, gsfd, gsfm;
	/* Ghost hit counter values observed by this call. */
	uint64_t ngrd, ngrm, ngfd, ngfm;

	/* Get current size of ARC states we can evict from. */
	mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
	mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
	mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
	mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
	uint64_t d = mrud + mfud;	/* all data */
	uint64_t m = mrum + mfum;	/* all metadata */
	uint64_t t = d + m;		/* everything sampled above */

	/* Get ARC ghost hits since last eviction. */
	ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
	uint64_t grd = ngrd - ogrd;
	ogrd = ngrd;
	ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
	uint64_t grm = ngrm - ogrm;
	ogrm = ngrm;
	ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
	uint64_t gfd = ngfd - ogfd;
	ogfd = ngfd;
	ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
	uint64_t gfm = ngfm - ogfm;
	ogfm = ngfm;

	/*
	 * Adjust ARC states balance based on ghost hits.  The ghost sizes
	 * passed as "total" are the targets set by the previous call.
	 */
	arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
	    grm + gfm, grd + gfd, zfs_arc_meta_balance);
	arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
	arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);

	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
	uint64_t ac = arc_c;
	/* Wanted total: the sampled total less the excess of asize over ac. */
	int64_t wt = t - (asize - ac);

	/*
	 * Try to reduce pinned dnodes if more than 3/4 of wanted metadata
	 * target is not evictable or if they go over arc_dnode_limit.
	 */
	int64_t prune = 0;
	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
	/* Non-evictable metadata: MRU + MFU size minus their evictable size. */
	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
	    - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
	/*
	 * Wanted metadata: wt scaled by arc_meta, which is used here as a
	 * fraction with 32 fractional bits (two 16-bit shifts).
	 */
	w = wt * (int64_t)(arc_meta >> 16) >> 16;
	if (nem > w * 3 / 4) {
		prune = dn / sizeof (dnode_t) *
		    zfs_arc_dnode_reduce_percent / 100;
		/* Between 3/4 and all of w: scale the request linearly. */
		if (nem < w && w > 4)
			prune = arc_mf(prune, nem - w * 3 / 4, w / 4);
	}
	if (dn > arc_dnode_limit) {
		prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) *
		    zfs_arc_dnode_reduce_percent / 100);
	}
	if (prune > 0)
		arc_prune_async(prune);

	/*
	 * Evict MRU metadata.  The wanted size is wt scaled by both arc_meta
	 * and arc_pm; never evict more than the excess of asize over ac.
	 */
	w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(mrum - w));
	bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
	total_evicted += bytes;
	mrum -= bytes;
	asize -= bytes;

	/*
	 * Evict MFU metadata.  Here w is the wanted size of all metadata and
	 * (m - bytes) is the metadata left after the MRU eviction above.
	 */
	w = wt * (int64_t)(arc_meta >> 16) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(m - bytes - w));
	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
	total_evicted += bytes;
	mfum -= bytes;
	asize -= bytes;

	/*
	 * Evict MRU data.  total_evicted holds only metadata evictions at
	 * this point, so wt becomes the wanted size for data alone.
	 */
	wt -= m - total_evicted;
	w = wt * (int64_t)(arc_pd >> 16) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(mrud - w));
	bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
	total_evicted += bytes;
	mrud -= bytes;
	asize -= bytes;

	/* Evict MFU data: whatever excess is still left. */
	e = asize - ac;
	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
	mfud -= bytes;
	total_evicted += bytes;

	/*
	 * Evict ghost lists
	 *
	 * Size of each state's ghost list represents how much that state
	 * may grow by shrinking the other states.  Would it need to shrink
	 * other states to zero (that is unlikely), its ghost size would be
	 * equal to sum of other three state sizes.  But excessive ghost
	 * size may result in false ghost hits (too far back), that may
	 * never result in real cache hits if several states are competing.
	 * So choose some arbitrary point of 1/2 of other state sizes.
	 */
	gsrd = (mrum + mfud + mfum) / 2;
	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
	    gsrd;
	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);

	gsrm = (mrud + mfud + mfum) / 2;
	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
	    gsrm;
	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);

	gsfd = (mrud + mrum + mfum) / 2;
	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
	    gsfd;
	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);

	gsfm = (mrud + mrum + mfud) / 2;
	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
	    gsfm;
	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);

	return (total_evicted);
}
4632
4633 static void
arc_flush_impl(uint64_t guid,boolean_t retry)4634 arc_flush_impl(uint64_t guid, boolean_t retry)
4635 {
4636 ASSERT(!retry || guid == 0);
4637
4638 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4639 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4640
4641 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4642 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4643
4644 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4645 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4646
4647 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4648 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4649
4650 (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry);
4651 (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
4652 }
4653
4654 void
arc_flush(spa_t * spa,boolean_t retry)4655 arc_flush(spa_t *spa, boolean_t retry)
4656 {
4657 /*
4658 * If retry is B_TRUE, a spa must not be specified since we have
4659 * no good way to determine if all of a spa's buffers have been
4660 * evicted from an arc state.
4661 */
4662 ASSERT(!retry || spa == NULL);
4663
4664 arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry);
4665 }
4666
4667 static arc_async_flush_t *
arc_async_flush_add(uint64_t spa_guid,uint_t level)4668 arc_async_flush_add(uint64_t spa_guid, uint_t level)
4669 {
4670 arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP);
4671 af->af_spa_guid = spa_guid;
4672 af->af_cache_level = level;
4673 taskq_init_ent(&af->af_tqent);
4674 list_link_init(&af->af_node);
4675
4676 mutex_enter(&arc_async_flush_lock);
4677 list_insert_tail(&arc_async_flush_list, af);
4678 mutex_exit(&arc_async_flush_lock);
4679
4680 return (af);
4681 }
4682
4683 static void
arc_async_flush_remove(uint64_t spa_guid,uint_t level)4684 arc_async_flush_remove(uint64_t spa_guid, uint_t level)
4685 {
4686 mutex_enter(&arc_async_flush_lock);
4687 for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4688 af != NULL; af = list_next(&arc_async_flush_list, af)) {
4689 if (af->af_spa_guid == spa_guid &&
4690 af->af_cache_level == level) {
4691 list_remove(&arc_async_flush_list, af);
4692 kmem_free(af, sizeof (*af));
4693 break;
4694 }
4695 }
4696 mutex_exit(&arc_async_flush_lock);
4697 }
4698
4699 static void
arc_flush_task(void * arg)4700 arc_flush_task(void *arg)
4701 {
4702 arc_async_flush_t *af = arg;
4703 hrtime_t start_time = gethrtime();
4704 uint64_t spa_guid = af->af_spa_guid;
4705
4706 arc_flush_impl(spa_guid, B_FALSE);
4707 arc_async_flush_remove(spa_guid, af->af_cache_level);
4708
4709 uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
4710 if (elapsed > 0) {
4711 zfs_dbgmsg("spa %llu arc flushed in %llu ms",
4712 (u_longlong_t)spa_guid, (u_longlong_t)elapsed);
4713 }
4714 }
4715
4716 /*
4717 * ARC buffers use the spa's load guid and can continue to exist after
4718 * the spa_t is gone (exported). The blocks are orphaned since each
4719 * spa import has a different load guid.
4720 *
4721 * It's OK if the spa is re-imported while this asynchronous flush is
4722 * still in progress. The new spa_load_guid will be different.
4723 *
4724 * Also, arc_fini will wait for any arc_flush_task to finish.
4725 */
4726 void
arc_flush_async(spa_t * spa)4727 arc_flush_async(spa_t *spa)
4728 {
4729 uint64_t spa_guid = spa_load_guid(spa);
4730 arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1);
4731
4732 taskq_dispatch_ent(arc_flush_taskq, arc_flush_task,
4733 af, TQ_SLEEP, &af->af_tqent);
4734 }
4735
4736 /*
4737 * Check if a guid is still in-use as part of an async teardown task
4738 */
4739 boolean_t
arc_async_flush_guid_inuse(uint64_t spa_guid)4740 arc_async_flush_guid_inuse(uint64_t spa_guid)
4741 {
4742 mutex_enter(&arc_async_flush_lock);
4743 for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4744 af != NULL; af = list_next(&arc_async_flush_list, af)) {
4745 if (af->af_spa_guid == spa_guid) {
4746 mutex_exit(&arc_async_flush_lock);
4747 return (B_TRUE);
4748 }
4749 }
4750 mutex_exit(&arc_async_flush_lock);
4751 return (B_FALSE);
4752 }
4753
4754 uint64_t
arc_reduce_target_size(uint64_t to_free)4755 arc_reduce_target_size(uint64_t to_free)
4756 {
4757 /*
4758 * Get the actual arc size. Even if we don't need it, this updates
4759 * the aggsum lower bound estimate for arc_is_overflowing().
4760 */
4761 uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
4762
4763 /*
4764 * All callers want the ARC to actually evict (at least) this much
4765 * memory. Therefore we reduce from the lower of the current size and
4766 * the target size. This way, even if arc_c is much higher than
4767 * arc_size (as can be the case after many calls to arc_freed(), we will
4768 * immediately have arc_c < arc_size and therefore the arc_evict_zthr
4769 * will evict.
4770 */
4771 uint64_t c = arc_c;
4772 if (c > arc_c_min) {
4773 c = MIN(c, MAX(asize, arc_c_min));
4774 to_free = MIN(to_free, c - arc_c_min);
4775 arc_c = c - to_free;
4776 } else {
4777 to_free = 0;
4778 }
4779
4780 /*
4781 * Since dbuf cache size is a fraction of target ARC size, we should
4782 * notify dbuf about the reduction, which might be significant,
4783 * especially if current ARC size was much smaller than the target.
4784 */
4785 dbuf_cache_reduce_target_size();
4786
4787 /*
4788 * Whether or not we reduced the target size, request eviction if the
4789 * current size is over it now, since caller obviously wants some RAM.
4790 */
4791 if (asize > arc_c) {
4792 /* See comment in arc_evict_cb_check() on why lock+flag */
4793 mutex_enter(&arc_evict_lock);
4794 arc_evict_needed = B_TRUE;
4795 mutex_exit(&arc_evict_lock);
4796 zthr_wakeup(arc_evict_zthr);
4797 }
4798
4799 return (to_free);
4800 }
4801
4802 /*
4803 * Determine if the system is under memory pressure and is asking
4804 * to reclaim memory. A return value of B_TRUE indicates that the system
4805 * is under memory pressure and that the arc should adjust accordingly.
4806 */
4807 boolean_t
arc_reclaim_needed(void)4808 arc_reclaim_needed(void)
4809 {
4810 return (arc_available_memory() < 0);
4811 }
4812
4813 void
arc_kmem_reap_soon(void)4814 arc_kmem_reap_soon(void)
4815 {
4816 size_t i;
4817 kmem_cache_t *prev_cache = NULL;
4818 kmem_cache_t *prev_data_cache = NULL;
4819
4820 #ifdef _KERNEL
4821 #if defined(_ILP32)
4822 /*
4823 * Reclaim unused memory from all kmem caches.
4824 */
4825 kmem_reap();
4826 #endif
4827 #endif
4828
4829 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4830 #if defined(_ILP32)
4831 /* reach upper limit of cache size on 32-bit */
4832 if (zio_buf_cache[i] == NULL)
4833 break;
4834 #endif
4835 if (zio_buf_cache[i] != prev_cache) {
4836 prev_cache = zio_buf_cache[i];
4837 kmem_cache_reap_now(zio_buf_cache[i]);
4838 }
4839 if (zio_data_buf_cache[i] != prev_data_cache) {
4840 prev_data_cache = zio_data_buf_cache[i];
4841 kmem_cache_reap_now(zio_data_buf_cache[i]);
4842 }
4843 }
4844 kmem_cache_reap_now(buf_cache);
4845 kmem_cache_reap_now(hdr_full_cache);
4846 kmem_cache_reap_now(hdr_l2only_cache);
4847 kmem_cache_reap_now(zfs_btree_leaf_cache);
4848 abd_cache_reap_now();
4849 }
4850
4851 static boolean_t
arc_evict_cb_check(void * arg,zthr_t * zthr)4852 arc_evict_cb_check(void *arg, zthr_t *zthr)
4853 {
4854 (void) arg, (void) zthr;
4855
4856 #ifdef ZFS_DEBUG
4857 /*
4858 * This is necessary in order to keep the kstat information
4859 * up to date for tools that display kstat data such as the
4860 * mdb ::arc dcmd and the Linux crash utility. These tools
4861 * typically do not call kstat's update function, but simply
4862 * dump out stats from the most recent update. Without
4863 * this call, these commands may show stale stats for the
4864 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
4865 * with this call, the data might be out of date if the
4866 * evict thread hasn't been woken recently; but that should
4867 * suffice. The arc_state_t structures can be queried
4868 * directly if more accurate information is needed.
4869 */
4870 if (arc_ksp != NULL)
4871 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4872 #endif
4873
4874 /*
4875 * We have to rely on arc_wait_for_eviction() to tell us when to
4876 * evict, rather than checking if we are overflowing here, so that we
4877 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
4878 * If we have become "not overflowing" since arc_wait_for_eviction()
4879 * checked, we need to wake it up. We could broadcast the CV here,
4880 * but arc_wait_for_eviction() may have not yet gone to sleep. We
4881 * would need to use a mutex to ensure that this function doesn't
4882 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
4883 * the arc_evict_lock). However, the lock ordering of such a lock
4884 * would necessarily be incorrect with respect to the zthr_lock,
4885 * which is held before this function is called, and is held by
4886 * arc_wait_for_eviction() when it calls zthr_wakeup().
4887 */
4888 if (arc_evict_needed)
4889 return (B_TRUE);
4890
4891 /*
4892 * If we have buffers in uncached state, evict them periodically.
4893 */
4894 return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
4895 zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
4896 ddi_get_lbolt() - arc_last_uncached_flush > arc_min_prefetch / 2));
4897 }
4898
4899 /*
4900 * Keep arc_size under arc_c by running arc_evict which evicts data
4901 * from the ARC.
4902 */
4903 static void
arc_evict_cb(void * arg,zthr_t * zthr)4904 arc_evict_cb(void *arg, zthr_t *zthr)
4905 {
4906 (void) arg;
4907
4908 uint64_t evicted = 0;
4909 fstrans_cookie_t cookie = spl_fstrans_mark();
4910
4911 /* Always try to evict from uncached state. */
4912 arc_last_uncached_flush = ddi_get_lbolt();
4913 evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
4914 evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
4915
4916 /* Evict from other states only if told to. */
4917 if (arc_evict_needed)
4918 evicted += arc_evict();
4919
4920 /*
4921 * If evicted is zero, we couldn't evict anything
4922 * via arc_evict(). This could be due to hash lock
4923 * collisions, but more likely due to the majority of
4924 * arc buffers being unevictable. Therefore, even if
4925 * arc_size is above arc_c, another pass is unlikely to
4926 * be helpful and could potentially cause us to enter an
4927 * infinite loop. Additionally, zthr_iscancelled() is
4928 * checked here so that if the arc is shutting down, the
4929 * broadcast will wake any remaining arc evict waiters.
4930 *
4931 * Note we cancel using zthr instead of arc_evict_zthr
4932 * because the latter may not yet be initializd when the
4933 * callback is first invoked.
4934 */
4935 mutex_enter(&arc_evict_lock);
4936 arc_evict_needed = !zthr_iscancelled(zthr) &&
4937 evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
4938 if (!arc_evict_needed) {
4939 /*
4940 * We're either no longer overflowing, or we
4941 * can't evict anything more, so we should wake
4942 * arc_get_data_impl() sooner.
4943 */
4944 arc_evict_waiter_t *aw;
4945 while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
4946 cv_signal(&aw->aew_cv);
4947 }
4948 arc_set_need_free();
4949 }
4950 mutex_exit(&arc_evict_lock);
4951 spl_fstrans_unmark(cookie);
4952 }
4953
4954 static boolean_t
arc_reap_cb_check(void * arg,zthr_t * zthr)4955 arc_reap_cb_check(void *arg, zthr_t *zthr)
4956 {
4957 (void) arg, (void) zthr;
4958
4959 int64_t free_memory = arc_available_memory();
4960 static int reap_cb_check_counter = 0;
4961
4962 /*
4963 * If a kmem reap is already active, don't schedule more. We must
4964 * check for this because kmem_cache_reap_soon() won't actually
4965 * block on the cache being reaped (this is to prevent callers from
4966 * becoming implicitly blocked by a system-wide kmem reap -- which,
4967 * on a system with many, many full magazines, can take minutes).
4968 */
4969 if (!kmem_cache_reap_active() && free_memory < 0) {
4970
4971 arc_no_grow = B_TRUE;
4972 arc_warm = B_TRUE;
4973 /*
4974 * Wait at least zfs_grow_retry (default 5) seconds
4975 * before considering growing.
4976 */
4977 arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
4978 return (B_TRUE);
4979 } else if (free_memory < arc_c >> zfs_arc_no_grow_shift) {
4980 arc_no_grow = B_TRUE;
4981 } else if (gethrtime() >= arc_growtime) {
4982 arc_no_grow = B_FALSE;
4983 }
4984
4985 /*
4986 * Called unconditionally every 60 seconds to reclaim unused
4987 * zstd compression and decompression context. This is done
4988 * here to avoid the need for an independent thread.
4989 */
4990 if (!((reap_cb_check_counter++) % 60))
4991 zfs_zstd_cache_reap_now();
4992
4993 return (B_FALSE);
4994 }
4995
4996 /*
4997 * Keep enough free memory in the system by reaping the ARC's kmem
4998 * caches. To cause more slabs to be reapable, we may reduce the
4999 * target size of the cache (arc_c), causing the arc_evict_cb()
5000 * to free more buffers.
5001 */
5002 static void
arc_reap_cb(void * arg,zthr_t * zthr)5003 arc_reap_cb(void *arg, zthr_t *zthr)
5004 {
5005 int64_t can_free, free_memory, to_free;
5006
5007 (void) arg, (void) zthr;
5008 fstrans_cookie_t cookie = spl_fstrans_mark();
5009
5010 /*
5011 * Kick off asynchronous kmem_reap()'s of all our caches.
5012 */
5013 arc_kmem_reap_soon();
5014
5015 /*
5016 * Wait at least arc_kmem_cache_reap_retry_ms between
5017 * arc_kmem_reap_soon() calls. Without this check it is possible to
5018 * end up in a situation where we spend lots of time reaping
5019 * caches, while we're near arc_c_min. Waiting here also gives the
5020 * subsequent free memory check a chance of finding that the
5021 * asynchronous reap has already freed enough memory, and we don't
5022 * need to call arc_reduce_target_size().
5023 */
5024 delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
5025
5026 /*
5027 * Reduce the target size as needed to maintain the amount of free
5028 * memory in the system at a fraction of the arc_size (1/128th by
5029 * default). If oversubscribed (free_memory < 0) then reduce the
5030 * target arc_size by the deficit amount plus the fractional
5031 * amount. If free memory is positive but less than the fractional
5032 * amount, reduce by what is needed to hit the fractional amount.
5033 */
5034 free_memory = arc_available_memory();
5035 can_free = arc_c - arc_c_min;
5036 to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory;
5037 if (to_free > 0)
5038 arc_reduce_target_size(to_free);
5039 spl_fstrans_unmark(cookie);
5040 }
5041
5042 #ifdef _KERNEL
5043 /*
5044 * Determine the amount of memory eligible for eviction contained in the
5045 * ARC. All clean data reported by the ghost lists can always be safely
5046 * evicted. Due to arc_c_min, the same does not hold for all clean data
5047 * contained by the regular mru and mfu lists.
5048 *
5049 * In the case of the regular mru and mfu lists, we need to report as
5050 * much clean data as possible, such that evicting that same reported
5051 * data will not bring arc_size below arc_c_min. Thus, in certain
5052 * circumstances, the total amount of clean data in the mru and mfu
5053 * lists might not actually be evictable.
5054 *
5055 * The following two distinct cases are accounted for:
5056 *
5057 * 1. The sum of the amount of dirty data contained by both the mru and
5058 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5059 * is greater than or equal to arc_c_min.
5060 * (i.e. amount of dirty data >= arc_c_min)
5061 *
5062 * This is the easy case; all clean data contained by the mru and mfu
5063 * lists is evictable. Evicting all clean data can only drop arc_size
5064 * to the amount of dirty data, which is greater than arc_c_min.
5065 *
5066 * 2. The sum of the amount of dirty data contained by both the mru and
5067 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5068 * is less than arc_c_min.
5069 * (i.e. arc_c_min > amount of dirty data)
5070 *
5071 * 2.1. arc_size is greater than or equal arc_c_min.
5072 * (i.e. arc_size >= arc_c_min > amount of dirty data)
5073 *
5074 * In this case, not all clean data from the regular mru and mfu
5075 * lists is actually evictable; we must leave enough clean data
5076 * to keep arc_size above arc_c_min. Thus, the maximum amount of
5077 * evictable data from the two lists combined, is exactly the
5078 * difference between arc_size and arc_c_min.
5079 *
5080 * 2.2. arc_size is less than arc_c_min
5081 * (i.e. arc_c_min > arc_size > amount of dirty data)
5082 *
5083 * In this case, none of the data contained in the mru and mfu
5084 * lists is evictable, even if it's clean. Since arc_size is
5085 * already below arc_c_min, evicting any more would only
5086 * increase this negative difference.
5087 */
5088
5089 #endif /* _KERNEL */
5090
5091 /*
5092 * Adapt arc info given the number of bytes we are trying to add and
5093 * the state that we are coming from. This function is only called
5094 * when we are adding new content to the cache.
5095 */
5096 static void
arc_adapt(uint64_t bytes)5097 arc_adapt(uint64_t bytes)
5098 {
5099 /*
5100 * Wake reap thread if we do not have any available memory
5101 */
5102 if (arc_reclaim_needed()) {
5103 zthr_wakeup(arc_reap_zthr);
5104 return;
5105 }
5106
5107 if (arc_no_grow)
5108 return;
5109
5110 if (arc_c >= arc_c_max)
5111 return;
5112
5113 /*
5114 * If we're within (2 * maxblocksize) bytes of the target
5115 * cache size, increment the target cache size
5116 */
5117 if (aggsum_upper_bound(&arc_sums.arcstat_size) +
5118 2 * SPA_MAXBLOCKSIZE >= arc_c) {
5119 uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
5120 if (atomic_add_64_nv(&arc_c, dc) > arc_c_max)
5121 arc_c = arc_c_max;
5122 }
5123 }
5124
5125 /*
5126 * Check if ARC current size has grown past our upper thresholds.
5127 */
5128 static arc_ovf_level_t
arc_is_overflowing(boolean_t lax,boolean_t use_reserve)5129 arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
5130 {
5131 /*
5132 * We just compare the lower bound here for performance reasons. Our
5133 * primary goals are to make sure that the arc never grows without
5134 * bound, and that it can reach its maximum size. This check
5135 * accomplishes both goals. The maximum amount we could run over by is
5136 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5137 * in the ARC. In practice, that's in the tens of MB, which is low
5138 * enough to be safe.
5139 */
5140 int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
5141 zfs_max_recordsize;
5142 int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
5143 arc_dnode_limit;
5144
5145 /* Always allow at least one block of overflow. */
5146 if (arc_over < 0 && dn_over <= 0)
5147 return (ARC_OVF_NONE);
5148
5149 /* If we are under memory pressure, report severe overflow. */
5150 if (!lax)
5151 return (ARC_OVF_SEVERE);
5152
5153 /* We are not under pressure, so be more or less relaxed. */
5154 int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
5155 if (use_reserve)
5156 overflow *= 3;
5157 return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
5158 }
5159
5160 static abd_t *
arc_get_data_abd(arc_buf_hdr_t * hdr,uint64_t size,const void * tag,int alloc_flags)5161 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
5162 int alloc_flags)
5163 {
5164 arc_buf_contents_t type = arc_buf_type(hdr);
5165
5166 arc_get_data_impl(hdr, size, tag, alloc_flags);
5167 if (alloc_flags & ARC_HDR_ALLOC_LINEAR)
5168 return (abd_alloc_linear(size, type == ARC_BUFC_METADATA));
5169 else
5170 return (abd_alloc(size, type == ARC_BUFC_METADATA));
5171 }
5172
5173 static void *
arc_get_data_buf(arc_buf_hdr_t * hdr,uint64_t size,const void * tag)5174 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
5175 {
5176 arc_buf_contents_t type = arc_buf_type(hdr);
5177
5178 arc_get_data_impl(hdr, size, tag, 0);
5179 if (type == ARC_BUFC_METADATA) {
5180 return (zio_buf_alloc(size));
5181 } else {
5182 ASSERT(type == ARC_BUFC_DATA);
5183 return (zio_data_buf_alloc(size));
5184 }
5185 }
5186
5187 /*
5188 * Wait for the specified amount of data (in bytes) to be evicted from the
5189 * ARC, and for there to be sufficient free memory in the system.
5190 * The lax argument specifies that caller does not have a specific reason
5191 * to wait, not aware of any memory pressure. Low memory handlers though
5192 * should set it to B_FALSE to wait for all required evictions to complete.
5193 * The use_reserve argument allows some callers to wait less than others
5194 * to not block critical code paths, possibly blocking other resources.
5195 */
5196 void
arc_wait_for_eviction(uint64_t amount,boolean_t lax,boolean_t use_reserve)5197 arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
5198 {
5199 switch (arc_is_overflowing(lax, use_reserve)) {
5200 case ARC_OVF_NONE:
5201 return;
5202 case ARC_OVF_SOME:
5203 /*
5204 * This is a bit racy without taking arc_evict_lock, but the
5205 * worst that can happen is we either call zthr_wakeup() extra
5206 * time due to race with other thread here, or the set flag
5207 * get cleared by arc_evict_cb(), which is unlikely due to
5208 * big hysteresis, but also not important since at this level
5209 * of overflow the eviction is purely advisory. Same time
5210 * taking the global lock here every time without waiting for
5211 * the actual eviction creates a significant lock contention.
5212 */
5213 if (!arc_evict_needed) {
5214 arc_evict_needed = B_TRUE;
5215 zthr_wakeup(arc_evict_zthr);
5216 }
5217 return;
5218 case ARC_OVF_SEVERE:
5219 default:
5220 {
5221 arc_evict_waiter_t aw;
5222 list_link_init(&aw.aew_node);
5223 cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
5224
5225 uint64_t last_count = 0;
5226 mutex_enter(&arc_evict_lock);
5227 arc_evict_waiter_t *last;
5228 if ((last = list_tail(&arc_evict_waiters)) != NULL) {
5229 last_count = last->aew_count;
5230 } else if (!arc_evict_needed) {
5231 arc_evict_needed = B_TRUE;
5232 zthr_wakeup(arc_evict_zthr);
5233 }
5234 /*
5235 * Note, the last waiter's count may be less than
5236 * arc_evict_count if we are low on memory in which
5237 * case arc_evict_state_impl() may have deferred
5238 * wakeups (but still incremented arc_evict_count).
5239 */
5240 aw.aew_count = MAX(last_count, arc_evict_count) + amount;
5241
5242 list_insert_tail(&arc_evict_waiters, &aw);
5243
5244 arc_set_need_free();
5245
5246 DTRACE_PROBE3(arc__wait__for__eviction,
5247 uint64_t, amount,
5248 uint64_t, arc_evict_count,
5249 uint64_t, aw.aew_count);
5250
5251 /*
5252 * We will be woken up either when arc_evict_count reaches
5253 * aew_count, or when the ARC is no longer overflowing and
5254 * eviction completes.
5255 * In case of "false" wakeup, we will still be on the list.
5256 */
5257 do {
5258 cv_wait(&aw.aew_cv, &arc_evict_lock);
5259 } while (list_link_active(&aw.aew_node));
5260 mutex_exit(&arc_evict_lock);
5261
5262 cv_destroy(&aw.aew_cv);
5263 }
5264 }
5265 }
5266
5267 /*
5268 * Allocate a block and return it to the caller. If we are hitting the
5269 * hard limit for the cache size, we must sleep, waiting for the eviction
5270 * thread to catch up. If we're past the target size but below the hard
5271 * limit, we'll only signal the reclaim thread and continue on.
5272 */
5273 static void
arc_get_data_impl(arc_buf_hdr_t * hdr,uint64_t size,const void * tag,int alloc_flags)5274 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
5275 int alloc_flags)
5276 {
5277 arc_adapt(size);
5278
5279 /*
5280 * If arc_size is currently overflowing, we must be adding data
5281 * faster than we are evicting. To ensure we don't compound the
5282 * problem by adding more data and forcing arc_size to grow even
5283 * further past it's target size, we wait for the eviction thread to
5284 * make some progress. We also wait for there to be sufficient free
5285 * memory in the system, as measured by arc_free_memory().
5286 *
5287 * Specifically, we wait for zfs_arc_eviction_pct percent of the
5288 * requested size to be evicted. This should be more than 100%, to
5289 * ensure that that progress is also made towards getting arc_size
5290 * under arc_c. See the comment above zfs_arc_eviction_pct.
5291 */
5292 arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
5293 B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE);
5294
5295 arc_buf_contents_t type = arc_buf_type(hdr);
5296 if (type == ARC_BUFC_METADATA) {
5297 arc_space_consume(size, ARC_SPACE_META);
5298 } else {
5299 arc_space_consume(size, ARC_SPACE_DATA);
5300 }
5301
5302 /*
5303 * Update the state size. Note that ghost states have a
5304 * "ghost size" and so don't need to be updated.
5305 */
5306 arc_state_t *state = hdr->b_l1hdr.b_state;
5307 if (!GHOST_STATE(state)) {
5308
5309 (void) zfs_refcount_add_many(&state->arcs_size[type], size,
5310 tag);
5311
5312 /*
5313 * If this is reached via arc_read, the link is
5314 * protected by the hash lock. If reached via
5315 * arc_buf_alloc, the header should not be accessed by
5316 * any other thread. And, if reached via arc_read_done,
5317 * the hash lock will protect it if it's found in the
5318 * hash table; otherwise no other thread should be
5319 * trying to [add|remove]_reference it.
5320 */
5321 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5322 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5323 (void) zfs_refcount_add_many(&state->arcs_esize[type],
5324 size, tag);
5325 }
5326 }
5327 }
5328
5329 static void
arc_free_data_abd(arc_buf_hdr_t * hdr,abd_t * abd,uint64_t size,const void * tag)5330 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
5331 const void *tag)
5332 {
5333 arc_free_data_impl(hdr, size, tag);
5334 abd_free(abd);
5335 }
5336
5337 static void
arc_free_data_buf(arc_buf_hdr_t * hdr,void * buf,uint64_t size,const void * tag)5338 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
5339 {
5340 arc_buf_contents_t type = arc_buf_type(hdr);
5341
5342 arc_free_data_impl(hdr, size, tag);
5343 if (type == ARC_BUFC_METADATA) {
5344 zio_buf_free(buf, size);
5345 } else {
5346 ASSERT(type == ARC_BUFC_DATA);
5347 zio_data_buf_free(buf, size);
5348 }
5349 }
5350
5351 /*
5352 * Free the arc data buffer.
5353 */
5354 static void
arc_free_data_impl(arc_buf_hdr_t * hdr,uint64_t size,const void * tag)5355 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
5356 {
5357 arc_state_t *state = hdr->b_l1hdr.b_state;
5358 arc_buf_contents_t type = arc_buf_type(hdr);
5359
5360 /* protected by hash lock, if in the hash table */
5361 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5362 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5363 ASSERT(state != arc_anon && state != arc_l2c_only);
5364
5365 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
5366 size, tag);
5367 }
5368 (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag);
5369
5370 VERIFY3U(hdr->b_type, ==, type);
5371 if (type == ARC_BUFC_METADATA) {
5372 arc_space_return(size, ARC_SPACE_META);
5373 } else {
5374 ASSERT(type == ARC_BUFC_DATA);
5375 arc_space_return(size, ARC_SPACE_DATA);
5376 }
5377 }
5378
5379 /*
5380 * This routine is called whenever a buffer is accessed.
5381 */
5382 static void
arc_access(arc_buf_hdr_t * hdr,arc_flags_t arc_flags,boolean_t hit)5383 arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
5384 {
5385 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
5386 ASSERT(HDR_HAS_L1HDR(hdr));
5387
5388 /*
5389 * Update buffer prefetch status.
5390 */
5391 boolean_t was_prefetch = HDR_PREFETCH(hdr);
5392 boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
5393 if (was_prefetch != now_prefetch) {
5394 if (was_prefetch) {
5395 ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
5396 HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
5397 prefetch);
5398 }
5399 if (HDR_HAS_L2HDR(hdr))
5400 l2arc_hdr_arcstats_decrement_state(hdr);
5401 if (was_prefetch) {
5402 arc_hdr_clear_flags(hdr,
5403 ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
5404 } else {
5405 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5406 }
5407 if (HDR_HAS_L2HDR(hdr))
5408 l2arc_hdr_arcstats_increment_state(hdr);
5409 }
5410 if (now_prefetch) {
5411 if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
5412 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5413 ARCSTAT_BUMP(arcstat_prescient_prefetch);
5414 } else {
5415 ARCSTAT_BUMP(arcstat_predictive_prefetch);
5416 }
5417 }
5418 if (arc_flags & ARC_FLAG_L2CACHE)
5419 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5420
5421 clock_t now = ddi_get_lbolt();
5422 if (hdr->b_l1hdr.b_state == arc_anon) {
5423 arc_state_t *new_state;
5424 /*
5425 * This buffer is not in the cache, and does not appear in
5426 * our "ghost" lists. Add it to the MRU or uncached state.
5427 */
5428 ASSERT0(hdr->b_l1hdr.b_arc_access);
5429 hdr->b_l1hdr.b_arc_access = now;
5430 if (HDR_UNCACHED(hdr)) {
5431 new_state = arc_uncached;
5432 DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
5433 hdr);
5434 } else {
5435 new_state = arc_mru;
5436 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5437 }
5438 arc_change_state(new_state, hdr);
5439 } else if (hdr->b_l1hdr.b_state == arc_mru) {
5440 /*
5441 * This buffer has been accessed once recently and either
5442 * its read is still in progress or it is in the cache.
5443 */
5444 if (HDR_IO_IN_PROGRESS(hdr)) {
5445 hdr->b_l1hdr.b_arc_access = now;
5446 return;
5447 }
5448 hdr->b_l1hdr.b_mru_hits++;
5449 ARCSTAT_BUMP(arcstat_mru_hits);
5450
5451 /*
5452 * If the previous access was a prefetch, then it already
5453 * handled possible promotion, so nothing more to do for now.
5454 */
5455 if (was_prefetch) {
5456 hdr->b_l1hdr.b_arc_access = now;
5457 return;
5458 }
5459
5460 /*
5461 * If more than ARC_MINTIME have passed from the previous
5462 * hit, promote the buffer to the MFU state.
5463 */
5464 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
5465 ARC_MINTIME)) {
5466 hdr->b_l1hdr.b_arc_access = now;
5467 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5468 arc_change_state(arc_mfu, hdr);
5469 }
5470 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
5471 arc_state_t *new_state;
5472 /*
5473 * This buffer has been accessed once recently, but was
5474 * evicted from the cache. Would we have bigger MRU, it
5475 * would be an MRU hit, so handle it the same way, except
5476 * we don't need to check the previous access time.
5477 */
5478 hdr->b_l1hdr.b_mru_ghost_hits++;
5479 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
5480 hdr->b_l1hdr.b_arc_access = now;
5481 wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
5482 arc_hdr_size(hdr));
5483 if (was_prefetch) {
5484 new_state = arc_mru;
5485 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5486 } else {
5487 new_state = arc_mfu;
5488 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5489 }
5490 arc_change_state(new_state, hdr);
5491 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
5492 /*
5493 * This buffer has been accessed more than once and either
5494 * still in the cache or being restored from one of ghosts.
5495 */
5496 if (!HDR_IO_IN_PROGRESS(hdr)) {
5497 hdr->b_l1hdr.b_mfu_hits++;
5498 ARCSTAT_BUMP(arcstat_mfu_hits);
5499 }
5500 hdr->b_l1hdr.b_arc_access = now;
5501 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
5502 /*
5503 * This buffer has been accessed more than once recently, but
5504 * has been evicted from the cache. Would we have bigger MFU
5505 * it would stay in cache, so move it back to MFU state.
5506 */
5507 hdr->b_l1hdr.b_mfu_ghost_hits++;
5508 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
5509 hdr->b_l1hdr.b_arc_access = now;
5510 wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
5511 arc_hdr_size(hdr));
5512 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5513 arc_change_state(arc_mfu, hdr);
5514 } else if (hdr->b_l1hdr.b_state == arc_uncached) {
5515 /*
5516 * This buffer is uncacheable, but we got a hit. Probably
5517 * a demand read after prefetch. Nothing more to do here.
5518 */
5519 if (!HDR_IO_IN_PROGRESS(hdr))
5520 ARCSTAT_BUMP(arcstat_uncached_hits);
5521 hdr->b_l1hdr.b_arc_access = now;
5522 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
5523 /*
5524 * This buffer is on the 2nd Level ARC and was not accessed
5525 * for a long time, so treat it as new and put into MRU.
5526 */
5527 hdr->b_l1hdr.b_arc_access = now;
5528 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5529 arc_change_state(arc_mru, hdr);
5530 } else {
5531 cmn_err(CE_PANIC, "invalid arc state 0x%p",
5532 hdr->b_l1hdr.b_state);
5533 }
5534 }
5535
5536 /*
5537 * This routine is called by dbuf_hold() to update the arc_access() state
5538 * which otherwise would be skipped for entries in the dbuf cache.
5539 */
5540 void
arc_buf_access(arc_buf_t * buf)5541 arc_buf_access(arc_buf_t *buf)
5542 {
5543 arc_buf_hdr_t *hdr = buf->b_hdr;
5544
5545 /*
5546 * Avoid taking the hash_lock when possible as an optimization.
5547 * The header must be checked again under the hash_lock in order
5548 * to handle the case where it is concurrently being released.
5549 */
5550 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
5551 return;
5552
5553 kmutex_t *hash_lock = HDR_LOCK(hdr);
5554 mutex_enter(hash_lock);
5555
5556 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5557 mutex_exit(hash_lock);
5558 ARCSTAT_BUMP(arcstat_access_skip);
5559 return;
5560 }
5561
5562 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5563 hdr->b_l1hdr.b_state == arc_mfu ||
5564 hdr->b_l1hdr.b_state == arc_uncached);
5565
5566 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5567 arc_access(hdr, 0, B_TRUE);
5568 mutex_exit(hash_lock);
5569
5570 ARCSTAT_BUMP(arcstat_hits);
5571 ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
5572 !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5573 }
5574
5575 /* a generic arc_read_done_func_t */
5576 void
arc_getbuf_func(zio_t * zio,const zbookmark_phys_t * zb,const blkptr_t * bp,arc_buf_t * buf,void * arg)5577 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5578 arc_buf_t *buf, void *arg)
5579 {
5580 (void) zb, (void) bp;
5581 arc_buf_t **bufp = arg;
5582
5583 if (buf == NULL) {
5584 ASSERT(zio == NULL || zio->io_error != 0);
5585 *bufp = NULL;
5586 } else {
5587 ASSERT(zio == NULL || zio->io_error == 0);
5588 *bufp = buf;
5589 ASSERT(buf->b_data != NULL);
5590 }
5591 }
5592
5593 static void
arc_hdr_verify(arc_buf_hdr_t * hdr,blkptr_t * bp)5594 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5595 {
5596 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5597 ASSERT0(HDR_GET_PSIZE(hdr));
5598 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
5599 } else {
5600 if (HDR_COMPRESSION_ENABLED(hdr)) {
5601 ASSERT3U(arc_hdr_get_compress(hdr), ==,
5602 BP_GET_COMPRESS(bp));
5603 }
5604 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5605 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5606 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
5607 }
5608 }
5609
/*
 * Completion callback for a read zio issued on behalf of an ARC header
 * (the header is taken from zio->io_private).
 *
 * In order, this routine:
 *  - looks the header up with buf_hash_find() if it is still in the hash
 *    table, which hands back the hash lock that is released further below;
 *  - for protected block pointers, records the object type, objset, salt
 *    and IV in the header and, if the read succeeded, the MAC as well;
 *  - on success, records the byteswap function and (unless this was an
 *    L2ARC read) the compression level;
 *  - detaches the header's callback list and, for every callback that has
 *    a done function and did not ask for "no buf", allocates a buffer via
 *    arc_buf_alloc_impl(); an allocation failure is stored in
 *    zio->io_error and therefore fails the read for all callbacks;
 *  - on error, flags the header with ARC_FLAG_IO_ERROR, moves it to
 *    arc_anon and removes it from the hash table;
 *  - clears ARC_FLAG_IO_IN_PROGRESS, drops the reference tagged with the
 *    header itself and releases the hash lock;
 *  - finally, with the hash lock dropped, invokes each done function,
 *    fires each dummy zio with the read's error, and wakes (acb_wait) or
 *    frees every arc_callback_t.
 */
static void
arc_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	arc_buf_hdr_t *hdr = zio->io_private;
	kmutex_t *hash_lock = NULL;
	arc_callback_t *callback_list;
	arc_callback_t *acb;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	if (HDR_IN_HASH_TABLE(hdr)) {
		arc_buf_hdr_t *found;

		ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp));
		ASSERT3U(hdr->b_dva.dva_word[0], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
		ASSERT3U(hdr->b_dva.dva_word[1], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[1]);

		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);

		ASSERT((found == hdr &&
		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
		    (found == hdr && HDR_L2_READING(hdr)));
		ASSERT3P(hash_lock, !=, NULL);
	}

	/*
	 * For protected blocks, remember the encryption parameters in the
	 * header.  The MAC is only decoded when the read succeeded.
	 */
	if (BP_IS_PROTECTED(bp)) {
		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv);

		if (zio->io_error == 0) {
			if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
				void *tmpbuf;

				/*
				 * Intent log blocks: the MAC is decoded
				 * from the leading zil_chain_t of the data
				 * just read rather than from the bp.
				 */
				tmpbuf = abd_borrow_buf_copy(zio->io_abd,
				    sizeof (zil_chain_t));
				zio_crypt_decode_mac_zil(tmpbuf,
				    hdr->b_crypt_hdr.b_mac);
				abd_return_buf(zio->io_abd, tmpbuf,
				    sizeof (zil_chain_t));
			} else {
				zio_crypt_decode_mac_bp(bp,
				    hdr->b_crypt_hdr.b_mac);
			}
		}
	}

	if (zio->io_error == 0) {
		/* byteswap if necessary */
		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
			if (BP_GET_LEVEL(zio->io_bp) > 0) {
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
			}
		} else {
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}
		if (!HDR_L2_READING(hdr)) {
			hdr->b_complevel = zio->io_prop.zp_complevel;
		}
	}

	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);

	/* Detach the whole callback list from the header. */
	callback_list = hdr->b_l1hdr.b_acb;
	ASSERT3P(callback_list, !=, NULL);
	hdr->b_l1hdr.b_acb = NULL;

	/*
	 * If a read request has a callback (i.e. acb_done is not NULL), then we
	 * make a buf containing the data according to the parameters which were
	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
	 * aren't needlessly decompressing the data multiple times.
	 */
	int callback_cnt = 0;
	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {

		/* We need the last one to call below in original order. */
		callback_list = acb;

		/* No buffer is built for these callbacks. */
		if (!acb->acb_done || acb->acb_nobuf)
			continue;

		callback_cnt++;

		/*
		 * Counted above, but nothing to allocate once the read has
		 * failed (possibly because an earlier callback's buffer
		 * allocation set io_error below).
		 */
		if (zio->io_error != 0)
			continue;

		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
		    &acb->acb_buf);

		/*
		 * Assert non-speculative zios didn't fail because an
		 * encryption key wasn't loaded
		 */
		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
		    error != EACCES);

		/*
		 * If we failed to decrypt, report an error now (as the zio
		 * layer would have done if it had done the transforms).
		 */
		if (error == ECKSUM) {
			ASSERT(BP_IS_PROTECTED(bp));
			error = SET_ERROR(EIO);
			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
				spa_log_error(zio->io_spa, &acb->acb_zb,
				    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
				(void) zfs_ereport_post(
				    FM_EREPORT_ZFS_AUTHENTICATION,
				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
			}
		}

		if (error != 0) {
			/*
			 * Decompression or decryption failed.  Set
			 * io_error so that when we call acb_done
			 * (below), we will indicate that the read
			 * failed. Note that in the unusual case
			 * where one callback is compressed and another
			 * uncompressed, we will mark all of them
			 * as failed, even though the uncompressed
			 * one can't actually fail.  In this case,
			 * the hdr will not be anonymous, because
			 * if there are multiple callbacks, it's
			 * because multiple threads found the same
			 * arc buf in the hash table.
			 */
			zio->io_error = error;
		}
	}

	/*
	 * If there are multiple callbacks, we must have the hash lock,
	 * because the only way for multiple threads to find this hdr is
	 * in the hash table.  This ensures that if there are multiple
	 * callbacks, the hdr is not anonymous.  If it were anonymous,
	 * we couldn't use arc_buf_destroy() in the error case below.
	 */
	ASSERT(callback_cnt < 2 || hash_lock != NULL);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);
	} else {
		/* Failed read: make the header anonymous and unhash it. */
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
	}

	/*
	 * The I/O is over: clear the in-progress flag and drop the
	 * reference that was tagged with the header itself.
	 */
	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
	(void) remove_reference(hdr, hdr);

	if (hash_lock != NULL)
		mutex_exit(hash_lock);

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done != NULL) {
			if (zio->io_error != 0 && acb->acb_buf != NULL) {
				/*
				 * If arc_buf_alloc_impl() fails during
				 * decompression, the buf will still be
				 * allocated, and needs to be freed here.
				 */
				arc_buf_destroy(acb->acb_buf,
				    acb->acb_private);
				acb->acb_buf = NULL;
			}
			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
			    acb->acb_buf, acb->acb_private);
		}

		/* Hand the read's error to the dummy zio and fire it. */
		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		/*
		 * Step to the previous entry before signalling: once
		 * acb_wait is cleared the waiting thread may free the acb.
		 */
		callback_list = acb->acb_prev;
		if (acb->acb_wait) {
			mutex_enter(&acb->acb_wait_lock);
			acb->acb_wait_error = zio->io_error;
			acb->acb_wait = B_FALSE;
			cv_signal(&acb->acb_wait_cv);
			mutex_exit(&acb->acb_wait_lock);
			/* acb will be freed by the waiting thread. */
		} else {
			kmem_free(acb, sizeof (arc_callback_t));
		}
	}
}
5819
5820 /*
5821 * Lookup the block at the specified DVA (in bp), and return the manner in
5822 * which the block is cached. A zero return indicates not cached.
5823 */
5824 int
arc_cached(spa_t * spa,const blkptr_t * bp)5825 arc_cached(spa_t *spa, const blkptr_t *bp)
5826 {
5827 arc_buf_hdr_t *hdr = NULL;
5828 kmutex_t *hash_lock = NULL;
5829 uint64_t guid = spa_load_guid(spa);
5830 int flags = 0;
5831
5832 if (BP_IS_EMBEDDED(bp))
5833 return (ARC_CACHED_EMBEDDED);
5834
5835 hdr = buf_hash_find(guid, bp, &hash_lock);
5836 if (hdr == NULL)
5837 return (0);
5838
5839 if (HDR_HAS_L1HDR(hdr)) {
5840 arc_state_t *state = hdr->b_l1hdr.b_state;
5841 /*
5842 * We switch to ensure that any future arc_state_type_t
5843 * changes are handled. This is just a shift to promote
5844 * more compile-time checking.
5845 */
5846 switch (state->arcs_state) {
5847 case ARC_STATE_ANON:
5848 break;
5849 case ARC_STATE_MRU:
5850 flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1;
5851 break;
5852 case ARC_STATE_MFU:
5853 flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1;
5854 break;
5855 case ARC_STATE_UNCACHED:
5856 /* The header is still in L1, probably not for long */
5857 flags |= ARC_CACHED_IN_L1;
5858 break;
5859 default:
5860 break;
5861 }
5862 }
5863 if (HDR_HAS_L2HDR(hdr))
5864 flags |= ARC_CACHED_IN_L2;
5865
5866 mutex_exit(hash_lock);
5867
5868 return (flags);
5869 }
5870
5871 /*
5872 * "Read" the block at the specified DVA (in bp) via the
5873 * cache. If the block is found in the cache, invoke the provided
5874 * callback immediately and return. Note that the `zio' parameter
5875 * in the callback will be NULL in this case, since no IO was
5876 * required. If the block is not in the cache pass the read request
5877 * on to the spa with a substitute callback function, so that the
5878 * requested block will be added to the cache.
5879 *
5880 * If a read request arrives for a block that has a read in-progress,
5881 * either wait for the in-progress read to complete (and return the
5882 * results); or, if this is a read with a "done" func, add a record
5883 * to the read to invoke the "done" func when the read completes,
5884 * and return; or just return.
5885 *
5886 * arc_read_done() will invoke all the requested "done" functions
5887 * for readers of this block.
5888 */
5889 int
arc_read(zio_t * pio,spa_t * spa,const blkptr_t * bp,arc_read_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,arc_flags_t * arc_flags,const zbookmark_phys_t * zb)5890 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
5891 arc_read_done_func_t *done, void *private, zio_priority_t priority,
5892 int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5893 {
5894 arc_buf_hdr_t *hdr = NULL;
5895 kmutex_t *hash_lock = NULL;
5896 zio_t *rzio;
5897 uint64_t guid = spa_load_guid(spa);
5898 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
5899 boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
5900 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5901 boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
5902 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5903 boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
5904 boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
5905 arc_buf_t *buf = NULL;
5906 int rc = 0;
5907 boolean_t bp_validation = B_FALSE;
5908
5909 ASSERT(!embedded_bp ||
5910 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5911 ASSERT(!BP_IS_HOLE(bp));
5912 ASSERT(!BP_IS_REDACTED(bp));
5913
5914 /*
5915 * Normally SPL_FSTRANS will already be set since kernel threads which
5916 * expect to call the DMU interfaces will set it when created. System
5917 * calls are similarly handled by setting/cleaning the bit in the
5918 * registered callback (module/os/.../zfs/zpl_*).
5919 *
5920 * External consumers such as Lustre which call the exported DMU
5921 * interfaces may not have set SPL_FSTRANS. To avoid a deadlock
5922 * on the hash_lock always set and clear the bit.
5923 */
5924 fstrans_cookie_t cookie = spl_fstrans_mark();
5925 top:
5926 if (!embedded_bp) {
5927 /*
5928 * Embedded BP's have no DVA and require no I/O to "read".
5929 * Create an anonymous arc buf to back it.
5930 */
5931 hdr = buf_hash_find(guid, bp, &hash_lock);
5932 }
5933
5934 /*
5935 * Determine if we have an L1 cache hit or a cache miss. For simplicity
5936 * we maintain encrypted data separately from compressed / uncompressed
5937 * data. If the user is requesting raw encrypted data and we don't have
5938 * that in the header we will read from disk to guarantee that we can
5939 * get it even if the encryption keys aren't loaded.
5940 */
5941 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
5942 (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
5943 boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
5944
5945 /*
5946 * Verify the block pointer contents are reasonable. This
5947 * should always be the case since the blkptr is protected by
5948 * a checksum.
5949 */
5950 if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
5951 BLK_VERIFY_LOG)) {
5952 mutex_exit(hash_lock);
5953 rc = SET_ERROR(ECKSUM);
5954 goto done;
5955 }
5956
5957 if (HDR_IO_IN_PROGRESS(hdr)) {
5958 if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
5959 mutex_exit(hash_lock);
5960 ARCSTAT_BUMP(arcstat_cached_only_in_progress);
5961 rc = SET_ERROR(ENOENT);
5962 goto done;
5963 }
5964
5965 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5966 ASSERT3P(head_zio, !=, NULL);
5967 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5968 priority == ZIO_PRIORITY_SYNC_READ) {
5969 /*
5970 * This is a sync read that needs to wait for
5971 * an in-flight async read. Request that the
5972 * zio have its priority upgraded.
5973 */
5974 zio_change_priority(head_zio, priority);
5975 DTRACE_PROBE1(arc__async__upgrade__sync,
5976 arc_buf_hdr_t *, hdr);
5977 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5978 }
5979
5980 DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
5981 arc_access(hdr, *arc_flags, B_FALSE);
5982
5983 /*
5984 * If there are multiple threads reading the same block
5985 * and that block is not yet in the ARC, then only one
5986 * thread will do the physical I/O and all other
5987 * threads will wait until that I/O completes.
5988 * Synchronous reads use the acb_wait_cv whereas nowait
5989 * reads register a callback. Both are signalled/called
5990 * in arc_read_done.
5991 *
5992 * Errors of the physical I/O may need to be propagated.
5993 * Synchronous read errors are returned here from
5994 * arc_read_done via acb_wait_error. Nowait reads
5995 * attach the acb_zio_dummy zio to pio and
5996 * arc_read_done propagates the physical I/O's io_error
5997 * to acb_zio_dummy, and thereby to pio.
5998 */
5999 arc_callback_t *acb = NULL;
6000 if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
6001 acb = kmem_zalloc(sizeof (arc_callback_t),
6002 KM_SLEEP);
6003 acb->acb_done = done;
6004 acb->acb_private = private;
6005 acb->acb_compressed = compressed_read;
6006 acb->acb_encrypted = encrypted_read;
6007 acb->acb_noauth = noauth_read;
6008 acb->acb_nobuf = no_buf;
6009 if (*arc_flags & ARC_FLAG_WAIT) {
6010 acb->acb_wait = B_TRUE;
6011 mutex_init(&acb->acb_wait_lock, NULL,
6012 MUTEX_DEFAULT, NULL);
6013 cv_init(&acb->acb_wait_cv, NULL,
6014 CV_DEFAULT, NULL);
6015 }
6016 acb->acb_zb = *zb;
6017 if (pio != NULL) {
6018 acb->acb_zio_dummy = zio_null(pio,
6019 spa, NULL, NULL, NULL, zio_flags);
6020 }
6021 acb->acb_zio_head = head_zio;
6022 acb->acb_next = hdr->b_l1hdr.b_acb;
6023 hdr->b_l1hdr.b_acb->acb_prev = acb;
6024 hdr->b_l1hdr.b_acb = acb;
6025 }
6026 mutex_exit(hash_lock);
6027
6028 ARCSTAT_BUMP(arcstat_iohits);
6029 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6030 demand, prefetch, is_data, data, metadata, iohits);
6031
6032 if (*arc_flags & ARC_FLAG_WAIT) {
6033 mutex_enter(&acb->acb_wait_lock);
6034 while (acb->acb_wait) {
6035 cv_wait(&acb->acb_wait_cv,
6036 &acb->acb_wait_lock);
6037 }
6038 rc = acb->acb_wait_error;
6039 mutex_exit(&acb->acb_wait_lock);
6040 mutex_destroy(&acb->acb_wait_lock);
6041 cv_destroy(&acb->acb_wait_cv);
6042 kmem_free(acb, sizeof (arc_callback_t));
6043 }
6044 goto out;
6045 }
6046
6047 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
6048 hdr->b_l1hdr.b_state == arc_mfu ||
6049 hdr->b_l1hdr.b_state == arc_uncached);
6050
6051 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
6052 arc_access(hdr, *arc_flags, B_TRUE);
6053
6054 if (done && !no_buf) {
6055 ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
6056
6057 /* Get a buf with the desired data in it. */
6058 rc = arc_buf_alloc_impl(hdr, spa, zb, private,
6059 encrypted_read, compressed_read, noauth_read,
6060 B_TRUE, &buf);
6061 if (rc == ECKSUM) {
6062 /*
6063 * Convert authentication and decryption errors
6064 * to EIO (and generate an ereport if needed)
6065 * before leaving the ARC.
6066 */
6067 rc = SET_ERROR(EIO);
6068 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
6069 spa_log_error(spa, zb, hdr->b_birth);
6070 (void) zfs_ereport_post(
6071 FM_EREPORT_ZFS_AUTHENTICATION,
6072 spa, NULL, zb, NULL, 0);
6073 }
6074 }
6075 if (rc != 0) {
6076 arc_buf_destroy_impl(buf);
6077 buf = NULL;
6078 (void) remove_reference(hdr, private);
6079 }
6080
6081 /* assert any errors weren't due to unloaded keys */
6082 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
6083 rc != EACCES);
6084 }
6085 mutex_exit(hash_lock);
6086 ARCSTAT_BUMP(arcstat_hits);
6087 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6088 demand, prefetch, is_data, data, metadata, hits);
6089 *arc_flags |= ARC_FLAG_CACHED;
6090 goto done;
6091 } else {
6092 uint64_t lsize = BP_GET_LSIZE(bp);
6093 uint64_t psize = BP_GET_PSIZE(bp);
6094 arc_callback_t *acb;
6095 vdev_t *vd = NULL;
6096 uint64_t addr = 0;
6097 boolean_t devw = B_FALSE;
6098 uint64_t size;
6099 abd_t *hdr_abd;
6100 int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
6101 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
6102 int config_lock;
6103 int error;
6104
6105 if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
6106 if (hash_lock != NULL)
6107 mutex_exit(hash_lock);
6108 rc = SET_ERROR(ENOENT);
6109 goto done;
6110 }
6111
6112 if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
6113 config_lock = BLK_CONFIG_HELD;
6114 } else if (hash_lock != NULL) {
6115 /*
6116 * Prevent lock order reversal
6117 */
6118 config_lock = BLK_CONFIG_NEEDED_TRY;
6119 } else {
6120 config_lock = BLK_CONFIG_NEEDED;
6121 }
6122
6123 /*
6124 * Verify the block pointer contents are reasonable. This
6125 * should always be the case since the blkptr is protected by
6126 * a checksum.
6127 */
6128 if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
6129 config_lock, BLK_VERIFY_LOG))) {
6130 if (hash_lock != NULL)
6131 mutex_exit(hash_lock);
6132 if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
6133 BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
6134 bp_validation = B_TRUE;
6135 goto top;
6136 }
6137 rc = SET_ERROR(ECKSUM);
6138 goto done;
6139 }
6140
6141 if (hdr == NULL) {
6142 /*
6143 * This block is not in the cache or it has
6144 * embedded data.
6145 */
6146 arc_buf_hdr_t *exists = NULL;
6147 hdr = arc_hdr_alloc(guid, psize, lsize,
6148 BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
6149
6150 if (!embedded_bp) {
6151 hdr->b_dva = *BP_IDENTITY(bp);
6152 hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp);
6153 exists = buf_hash_insert(hdr, &hash_lock);
6154 }
6155 if (exists != NULL) {
6156 /* somebody beat us to the hash insert */
6157 mutex_exit(hash_lock);
6158 buf_discard_identity(hdr);
6159 arc_hdr_destroy(hdr);
6160 goto top; /* restart the IO request */
6161 }
6162 } else {
6163 /*
6164 * This block is in the ghost cache or encrypted data
6165 * was requested and we didn't have it. If it was
6166 * L2-only (and thus didn't have an L1 hdr),
6167 * we realloc the header to add an L1 hdr.
6168 */
6169 if (!HDR_HAS_L1HDR(hdr)) {
6170 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
6171 hdr_full_cache);
6172 }
6173
6174 if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
6175 ASSERT0P(hdr->b_l1hdr.b_pabd);
6176 ASSERT(!HDR_HAS_RABD(hdr));
6177 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6178 ASSERT0(zfs_refcount_count(
6179 &hdr->b_l1hdr.b_refcnt));
6180 ASSERT0P(hdr->b_l1hdr.b_buf);
6181 #ifdef ZFS_DEBUG
6182 ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
6183 #endif
6184 } else if (HDR_IO_IN_PROGRESS(hdr)) {
6185 /*
6186 * If this header already had an IO in progress
6187 * and we are performing another IO to fetch
6188 * encrypted data we must wait until the first
6189 * IO completes so as not to confuse
6190 * arc_read_done(). This should be very rare
6191 * and so the performance impact shouldn't
6192 * matter.
6193 */
6194 arc_callback_t *acb = kmem_zalloc(
6195 sizeof (arc_callback_t), KM_SLEEP);
6196 acb->acb_wait = B_TRUE;
6197 mutex_init(&acb->acb_wait_lock, NULL,
6198 MUTEX_DEFAULT, NULL);
6199 cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
6200 NULL);
6201 acb->acb_zio_head =
6202 hdr->b_l1hdr.b_acb->acb_zio_head;
6203 acb->acb_next = hdr->b_l1hdr.b_acb;
6204 hdr->b_l1hdr.b_acb->acb_prev = acb;
6205 hdr->b_l1hdr.b_acb = acb;
6206 mutex_exit(hash_lock);
6207 mutex_enter(&acb->acb_wait_lock);
6208 while (acb->acb_wait) {
6209 cv_wait(&acb->acb_wait_cv,
6210 &acb->acb_wait_lock);
6211 }
6212 mutex_exit(&acb->acb_wait_lock);
6213 mutex_destroy(&acb->acb_wait_lock);
6214 cv_destroy(&acb->acb_wait_cv);
6215 kmem_free(acb, sizeof (arc_callback_t));
6216 goto top;
6217 }
6218 }
6219 if (*arc_flags & ARC_FLAG_UNCACHED) {
6220 arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
6221 if (!encrypted_read)
6222 alloc_flags |= ARC_HDR_ALLOC_LINEAR;
6223 }
6224
6225 /*
6226 * Take additional reference for IO_IN_PROGRESS. It stops
6227 * arc_access() from putting this header without any buffers
6228 * and so other references but obviously nonevictable onto
6229 * the evictable list of MRU or MFU state.
6230 */
6231 add_reference(hdr, hdr);
6232 if (!embedded_bp)
6233 arc_access(hdr, *arc_flags, B_FALSE);
6234 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6235 arc_hdr_alloc_abd(hdr, alloc_flags);
6236 if (encrypted_read) {
6237 ASSERT(HDR_HAS_RABD(hdr));
6238 size = HDR_GET_PSIZE(hdr);
6239 hdr_abd = hdr->b_crypt_hdr.b_rabd;
6240 zio_flags |= ZIO_FLAG_RAW;
6241 } else {
6242 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6243 size = arc_hdr_size(hdr);
6244 hdr_abd = hdr->b_l1hdr.b_pabd;
6245
6246 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
6247 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6248 }
6249
6250 /*
6251 * For authenticated bp's, we do not ask the ZIO layer
6252 * to authenticate them since this will cause the entire
6253 * IO to fail if the key isn't loaded. Instead, we
6254 * defer authentication until arc_buf_fill(), which will
6255 * verify the data when the key is available.
6256 */
6257 if (BP_IS_AUTHENTICATED(bp))
6258 zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
6259 }
6260
6261 if (BP_IS_AUTHENTICATED(bp))
6262 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6263 if (BP_GET_LEVEL(bp) > 0)
6264 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
6265 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
6266
6267 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
6268 acb->acb_done = done;
6269 acb->acb_private = private;
6270 acb->acb_compressed = compressed_read;
6271 acb->acb_encrypted = encrypted_read;
6272 acb->acb_noauth = noauth_read;
6273 acb->acb_nobuf = no_buf;
6274 acb->acb_zb = *zb;
6275
6276 ASSERT0P(hdr->b_l1hdr.b_acb);
6277 hdr->b_l1hdr.b_acb = acb;
6278
6279 if (HDR_HAS_L2HDR(hdr) &&
6280 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
6281 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
6282 addr = hdr->b_l2hdr.b_daddr;
6283 /*
6284 * Lock out L2ARC device removal.
6285 */
6286 if (vdev_is_dead(vd) ||
6287 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6288 vd = NULL;
6289 }
6290
6291 /*
6292 * We count both async reads and scrub IOs as asynchronous so
6293 * that both can be upgraded in the event of a cache hit while
6294 * the read IO is still in-flight.
6295 */
6296 if (priority == ZIO_PRIORITY_ASYNC_READ ||
6297 priority == ZIO_PRIORITY_SCRUB)
6298 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6299 else
6300 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6301
6302 /*
6303 * At this point, we have a level 1 cache miss or a blkptr
6304 * with embedded data. Try again in L2ARC if possible.
6305 */
6306 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6307
6308 /*
6309 * Skip ARC stat bump for block pointers with embedded
6310 * data. The data are read from the blkptr itself via
6311 * decode_embedded_bp_compressed().
6312 */
6313 if (!embedded_bp) {
6314 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
6315 blkptr_t *, bp, uint64_t, lsize,
6316 zbookmark_phys_t *, zb);
6317 ARCSTAT_BUMP(arcstat_misses);
6318 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6319 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
6320 metadata, misses);
6321 zfs_racct_read(spa, size, 1,
6322 (*arc_flags & ARC_FLAG_UNCACHED) ?
6323 DMU_UNCACHEDIO : 0);
6324 }
6325
6326 /* Check if the spa even has l2 configured */
6327 const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
6328 spa->spa_l2cache.sav_count > 0;
6329
6330 if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
6331 /*
6332 * Read from the L2ARC if the following are true:
6333 * 1. The L2ARC vdev was previously cached.
6334 * 2. This buffer still has L2ARC metadata.
6335 * 3. This buffer isn't currently writing to the L2ARC.
6336 * 4. The L2ARC entry wasn't evicted, which may
6337 * also have invalidated the vdev.
6338 */
6339 if (HDR_HAS_L2HDR(hdr) &&
6340 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
6341 l2arc_read_callback_t *cb;
6342 abd_t *abd;
6343 uint64_t asize;
6344
6345 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6346 ARCSTAT_BUMP(arcstat_l2_hits);
6347 hdr->b_l2hdr.b_hits++;
6348
6349 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
6350 KM_SLEEP);
6351 cb->l2rcb_hdr = hdr;
6352 cb->l2rcb_bp = *bp;
6353 cb->l2rcb_zb = *zb;
6354 cb->l2rcb_flags = zio_flags;
6355
6356 /*
6357 * When Compressed ARC is disabled, but the
6358 * L2ARC block is compressed, arc_hdr_size()
6359 * will have returned LSIZE rather than PSIZE.
6360 */
6361 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
6362 !HDR_COMPRESSION_ENABLED(hdr) &&
6363 HDR_GET_PSIZE(hdr) != 0) {
6364 size = HDR_GET_PSIZE(hdr);
6365 }
6366
6367 asize = vdev_psize_to_asize(vd, size);
6368 if (asize != size) {
6369 abd = abd_alloc_for_io(asize,
6370 HDR_ISTYPE_METADATA(hdr));
6371 cb->l2rcb_abd = abd;
6372 } else {
6373 abd = hdr_abd;
6374 }
6375
6376 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
6377 addr + asize <= vd->vdev_psize -
6378 VDEV_LABEL_END_SIZE);
6379
6380 /*
6381 * l2arc read. The SCL_L2ARC lock will be
6382 * released by l2arc_read_done().
6383 * Issue a null zio if the underlying buffer
6384 * was squashed to zero size by compression.
6385 */
6386 ASSERT3U(arc_hdr_get_compress(hdr), !=,
6387 ZIO_COMPRESS_EMPTY);
6388 rzio = zio_read_phys(pio, vd, addr,
6389 asize, abd,
6390 ZIO_CHECKSUM_OFF,
6391 l2arc_read_done, cb, priority,
6392 zio_flags | ZIO_FLAG_CANFAIL |
6393 ZIO_FLAG_DONT_PROPAGATE |
6394 ZIO_FLAG_DONT_RETRY, B_FALSE);
6395 acb->acb_zio_head = rzio;
6396
6397 if (hash_lock != NULL)
6398 mutex_exit(hash_lock);
6399
6400 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6401 zio_t *, rzio);
6402 ARCSTAT_INCR(arcstat_l2_read_bytes,
6403 HDR_GET_PSIZE(hdr));
6404
6405 if (*arc_flags & ARC_FLAG_NOWAIT) {
6406 zio_nowait(rzio);
6407 goto out;
6408 }
6409
6410 ASSERT(*arc_flags & ARC_FLAG_WAIT);
6411 if (zio_wait(rzio) == 0)
6412 goto out;
6413
6414 /* l2arc read error; goto zio_read() */
6415 if (hash_lock != NULL)
6416 mutex_enter(hash_lock);
6417 } else {
6418 DTRACE_PROBE1(l2arc__miss,
6419 arc_buf_hdr_t *, hdr);
6420 ARCSTAT_BUMP(arcstat_l2_misses);
6421 if (HDR_L2_WRITING(hdr))
6422 ARCSTAT_BUMP(arcstat_l2_rw_clash);
6423 spa_config_exit(spa, SCL_L2ARC, vd);
6424 }
6425 } else {
6426 if (vd != NULL)
6427 spa_config_exit(spa, SCL_L2ARC, vd);
6428
6429 /*
6430 * Only a spa with l2 should contribute to l2
6431 * miss stats. (Including the case of having a
6432 * faulted cache device - that's also a miss.)
6433 */
6434 if (spa_has_l2) {
6435 /*
6436 * Skip ARC stat bump for block pointers with
6437 * embedded data. The data are read from the
6438 * blkptr itself via
6439 * decode_embedded_bp_compressed().
6440 */
6441 if (!embedded_bp) {
6442 DTRACE_PROBE1(l2arc__miss,
6443 arc_buf_hdr_t *, hdr);
6444 ARCSTAT_BUMP(arcstat_l2_misses);
6445 }
6446 }
6447 }
6448
6449 rzio = zio_read(pio, spa, bp, hdr_abd, size,
6450 arc_read_done, hdr, priority, zio_flags, zb);
6451 acb->acb_zio_head = rzio;
6452
6453 if (hash_lock != NULL)
6454 mutex_exit(hash_lock);
6455
6456 if (*arc_flags & ARC_FLAG_WAIT) {
6457 rc = zio_wait(rzio);
6458 goto out;
6459 }
6460
6461 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6462 zio_nowait(rzio);
6463 }
6464
6465 out:
6466 /* embedded bps don't actually go to disk */
6467 if (!embedded_bp)
6468 spa_read_history_add(spa, zb, *arc_flags);
6469 spl_fstrans_unmark(cookie);
6470 return (rc);
6471
6472 done:
6473 if (done)
6474 done(NULL, zb, bp, buf, private);
6475 if (pio && rc != 0) {
6476 zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
6477 zio->io_error = rc;
6478 zio_nowait(zio);
6479 }
6480 goto out;
6481 }
6482
6483 arc_prune_t *
arc_add_prune_callback(arc_prune_func_t * func,void * private)6484 arc_add_prune_callback(arc_prune_func_t *func, void *private)
6485 {
6486 arc_prune_t *p;
6487
6488 p = kmem_alloc(sizeof (*p), KM_SLEEP);
6489 p->p_pfunc = func;
6490 p->p_private = private;
6491 list_link_init(&p->p_node);
6492 zfs_refcount_create(&p->p_refcnt);
6493
6494 mutex_enter(&arc_prune_mtx);
6495 zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
6496 list_insert_head(&arc_prune_list, p);
6497 mutex_exit(&arc_prune_mtx);
6498
6499 return (p);
6500 }
6501
6502 void
arc_remove_prune_callback(arc_prune_t * p)6503 arc_remove_prune_callback(arc_prune_t *p)
6504 {
6505 boolean_t wait = B_FALSE;
6506 mutex_enter(&arc_prune_mtx);
6507 list_remove(&arc_prune_list, p);
6508 if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
6509 wait = B_TRUE;
6510 mutex_exit(&arc_prune_mtx);
6511
6512 /* wait for arc_prune_task to finish */
6513 if (wait)
6514 taskq_wait_outstanding(arc_prune_taskq, 0);
6515 ASSERT0(zfs_refcount_count(&p->p_refcnt));
6516 zfs_refcount_destroy(&p->p_refcnt);
6517 kmem_free(p, sizeof (*p));
6518 }
6519
6520 /*
6521 * Helper function for arc_prune_async() it is responsible for safely
6522 * handling the execution of a registered arc_prune_func_t.
6523 */
6524 static void
arc_prune_task(void * ptr)6525 arc_prune_task(void *ptr)
6526 {
6527 arc_prune_t *ap = (arc_prune_t *)ptr;
6528 arc_prune_func_t *func = ap->p_pfunc;
6529
6530 if (func != NULL)
6531 func(ap->p_adjust, ap->p_private);
6532
6533 (void) zfs_refcount_remove(&ap->p_refcnt, func);
6534 }
6535
6536 /*
6537 * Notify registered consumers they must drop holds on a portion of the ARC
6538 * buffers they reference. This provides a mechanism to ensure the ARC can
6539 * honor the metadata limit and reclaim otherwise pinned ARC buffers.
6540 *
6541 * This operation is performed asynchronously so it may be safely called
6542 * in the context of the arc_reclaim_thread(). A reference is taken here
6543 * for each registered arc_prune_t and the arc_prune_task() is responsible
6544 * for releasing it once the registered arc_prune_func_t has completed.
6545 */
6546 static void
arc_prune_async(uint64_t adjust)6547 arc_prune_async(uint64_t adjust)
6548 {
6549 arc_prune_t *ap;
6550
6551 mutex_enter(&arc_prune_mtx);
6552 for (ap = list_head(&arc_prune_list); ap != NULL;
6553 ap = list_next(&arc_prune_list, ap)) {
6554
6555 if (zfs_refcount_count(&ap->p_refcnt) >= 2)
6556 continue;
6557
6558 zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
6559 ap->p_adjust = adjust;
6560 if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
6561 ap, TQ_SLEEP) == TASKQID_INVALID) {
6562 (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
6563 continue;
6564 }
6565 ARCSTAT_BUMP(arcstat_prune);
6566 }
6567 mutex_exit(&arc_prune_mtx);
6568 }
6569
6570 /*
6571 * Notify the arc that a block was freed, and thus will never be used again.
6572 */
6573 void
arc_freed(spa_t * spa,const blkptr_t * bp)6574 arc_freed(spa_t *spa, const blkptr_t *bp)
6575 {
6576 arc_buf_hdr_t *hdr;
6577 kmutex_t *hash_lock;
6578 uint64_t guid = spa_load_guid(spa);
6579
6580 ASSERT(!BP_IS_EMBEDDED(bp));
6581
6582 hdr = buf_hash_find(guid, bp, &hash_lock);
6583 if (hdr == NULL)
6584 return;
6585
6586 /*
6587 * We might be trying to free a block that is still doing I/O
6588 * (i.e. prefetch) or has some other reference (i.e. a dedup-ed,
6589 * dmu_sync-ed block). A block may also have a reference if it is
6590 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6591 * have written the new block to its final resting place on disk but
6592 * without the dedup flag set. This would have left the hdr in the MRU
6593 * state and discoverable. When the txg finally syncs it detects that
6594 * the block was overridden in open context and issues an override I/O.
6595 * Since this is a dedup block, the override I/O will determine if the
6596 * block is already in the DDT. If so, then it will replace the io_bp
6597 * with the bp from the DDT and allow the I/O to finish. When the I/O
6598 * reaches the done callback, dbuf_write_override_done, it will
6599 * check to see if the io_bp and io_bp_override are identical.
6600 * If they are not, then it indicates that the bp was replaced with
6601 * the bp in the DDT and the override bp is freed. This allows
6602 * us to arrive here with a reference on a block that is being
6603 * freed. So if we have an I/O in progress, or a reference to
6604 * this hdr, then we don't destroy the hdr.
6605 */
6606 if (!HDR_HAS_L1HDR(hdr) ||
6607 zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6608 arc_change_state(arc_anon, hdr);
6609 arc_hdr_destroy(hdr);
6610 mutex_exit(hash_lock);
6611 } else {
6612 mutex_exit(hash_lock);
6613 }
6614
6615 }
6616
/*
 * Release this buffer from the cache, making it an anonymous buffer.  This
 * must be done after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 *
 * Three cases are handled:
 *
 *  1. The hdr is already in the arc_anon state: without taking the hash
 *     lock, its access time is cleared, its identity is discarded and the
 *     buf is thawed.
 *
 *  2. The hdr cannot simply be taken over by buf (buf is not the hdr's
 *     only buffer, or HDR_L2_WRITING is set while buf does not share the
 *     hdr's data): buf is removed from the hdr, the caller's reference on
 *     the old hdr is dropped, and buf is attached to a fresh hdr obtained
 *     from arc_hdr_alloc() and accounted to arc_anon.
 *
 *  3. Otherwise the existing hdr is kept: its L2 portion (if any) is
 *     destroyed, the hit counters are zeroed, and the hdr is moved to
 *     arc_anon before its identity is discarded and the buf is thawed.
 *
 * 'tag' is the reference holder: it is passed to remove_reference() on the
 * old hdr and to zfs_refcount_add() on the newly allocated one.
 */
void
arc_release(arc_buf_t *buf, const void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * It would be nice to assert that if its DMU metadata (level >
	 * 0 || it's the dnode file), then it must be syncing context.
	 * But we don't know that information at this level.
	 */

	ASSERT(HDR_HAS_L1HDR(hdr));

	/*
	 * We don't grab the hash lock prior to this check, because if
	 * the buffer's header is in the arc_anon state, it won't be
	 * linked into the hash table.
	 */
	if (hdr->b_l1hdr.b_state == arc_anon) {
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		ASSERT(!HDR_HAS_L2HDR(hdr));

		/* An anonymous hdr is expected to have exactly one buf/ref. */
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
		ASSERT(ARC_BUF_LAST(buf));
		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		hdr->b_l1hdr.b_arc_access = 0;

		/*
		 * If the buf is being overridden then it may already
		 * have a hdr that is not empty.
		 */
		buf_discard_identity(hdr);
		arc_buf_thaw(buf);

		return;
	}

	kmutex_t *hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);

	/*
	 * This assignment is only valid as long as the hash_lock is
	 * held, we must be careful not to reference state or the
	 * b_state field after dropping the lock.
	 */
	arc_state_t *state = hdr->b_l1hdr.b_state;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT3P(state, !=, arc_anon);
	ASSERT3P(state, !=, arc_l2c_only);

	/* this buffer is not on any list */
	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);

	/*
	 * Do we have more than one buf?  Or L2_WRITING with unshared data?
	 * Single-buf L2_WRITING with shared data can reuse the header since
	 * L2ARC uses its own transformed copy.
	 */
	if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf) ||
	    (HDR_L2_WRITING(hdr) && !ARC_BUF_SHARED(buf))) {
		arc_buf_hdr_t *nhdr;
		/*
		 * Snapshot everything needed to build the new hdr while the
		 * hash lock is still held; the old hdr is not touched again
		 * after the lock is dropped below.
		 */
		uint64_t spa = hdr->b_spa;
		uint64_t psize = HDR_GET_PSIZE(hdr);
		uint64_t lsize = HDR_GET_LSIZE(hdr);
		boolean_t protected = HDR_PROTECTED(hdr);
		enum zio_compress compress = arc_hdr_get_compress(hdr);
		uint8_t complevel = hdr->b_complevel;
		arc_buf_contents_t type = arc_buf_type(hdr);
		/*
		 * True when buf is the hdr's only buffer and this branch was
		 * entered solely because HDR_L2_WRITING is set.
		 */
		boolean_t single_buf_l2writing = (hdr->b_l1hdr.b_buf == buf &&
		    ARC_BUF_LAST(buf) && HDR_L2_WRITING(hdr));

		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
			ASSERT(ARC_BUF_LAST(buf));
		}

		/*
		 * Pull the buffer off of this hdr and find the last buffer
		 * in the hdr's buffer list.
		 */
		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
		/* No buffer remains exactly in the single-buf case. */
		EQUIV(single_buf_l2writing, lastbuf == NULL);

		/*
		 * If the current arc_buf_t and the hdr are sharing their data
		 * buffer, then we must stop sharing that block.
		 */
		if (!single_buf_l2writing) {
			if (ARC_BUF_SHARED(buf)) {
				ASSERT(!arc_buf_is_shared(lastbuf));

				/*
				 * First, sever the block sharing relationship
				 * between buf and the arc_buf_hdr_t.
				 */
				arc_unshare_buf(hdr, buf);

				/*
				 * Now we need to recreate the hdr's b_pabd.
				 * Since we have lastbuf handy, we try to share
				 * with it, but if we can't then we allocate a
				 * new b_pabd and copy the data from buf into it
				 */
				if (arc_can_share(hdr, lastbuf)) {
					arc_share_buf(hdr, lastbuf);
				} else {
					arc_hdr_alloc_abd(hdr, 0);
					abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
					    buf->b_data, psize);
				}
			} else if (HDR_SHARED_DATA(hdr)) {
				/*
				 * Uncompressed shared buffers are always at the
				 * end of the list. Compressed buffers don't
				 * have the same requirements. This makes it
				 * hard to simply assert that the lastbuf is
				 * shared so we rely on the hdr's compression
				 * flags to determine if we have a compressed,
				 * shared buffer.
				 */
				ASSERT(arc_buf_is_shared(lastbuf) ||
				    arc_hdr_get_compress(hdr) !=
				    ZIO_COMPRESS_OFF);
				ASSERT(!arc_buf_is_shared(buf));
			}
		}

		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));

		/* buf no longer counts toward the old state's size. */
		(void) zfs_refcount_remove_many(&state->arcs_size[type],
		    arc_buf_size(buf), buf);

		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		/* if this is the last uncompressed buf free the checksum */
		if (!arc_hdr_has_uncompressed_buf(hdr))
			arc_cksum_free(hdr);

		/*
		 * Drop the caller's reference on the old hdr.  In the
		 * single-buf case this must have been the last reference;
		 * otherwise other references must remain.
		 */
		if (single_buf_l2writing)
			VERIFY3S(remove_reference(hdr, tag), ==, 0);
		else
			VERIFY3S(remove_reference(hdr, tag), >, 0);

		mutex_exit(hash_lock);

		/* Build a fresh hdr, using the values captured above. */
		nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress,
		    complevel, type);
		ASSERT0P(nhdr->b_l1hdr.b_buf);
		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
		VERIFY3U(nhdr->b_type, ==, type);
		ASSERT(!HDR_SHARED_DATA(nhdr));

		/* Attach buf to the new hdr and re-take the reference. */
		nhdr->b_l1hdr.b_buf = buf;
		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
		buf->b_hdr = nhdr;

		/* buf's size is now accounted to the anonymous state. */
		(void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
		    arc_buf_size(buf), buf);
	} else {
		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
		/* protected by hash lock, or hdr is on arc_anon */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));

		if (HDR_HAS_L2HDR(hdr)) {
			mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
			/* Recheck to prevent race with l2arc_evict(). */
			if (HDR_HAS_L2HDR(hdr))
				arc_hdr_l2hdr_destroy(hdr);
			mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
		}

		/* Reset hit statistics before the hdr becomes anonymous. */
		hdr->b_l1hdr.b_mru_hits = 0;
		hdr->b_l1hdr.b_mru_ghost_hits = 0;
		hdr->b_l1hdr.b_mfu_hits = 0;
		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
		arc_change_state(arc_anon, hdr);
		hdr->b_l1hdr.b_arc_access = 0;

		mutex_exit(hash_lock);
		buf_discard_identity(hdr);
		arc_buf_thaw(buf);
	}
}
6811
6812 int
arc_released(arc_buf_t * buf)6813 arc_released(arc_buf_t *buf)
6814 {
6815 return (buf->b_data != NULL &&
6816 buf->b_hdr->b_l1hdr.b_state == arc_anon);
6817 }
6818
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: returns the current reference count of the hdr that
 * 'buf' is attached to.
 */
int
arc_referenced(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	return (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
}
#endif
6826
/*
 * "Ready" callback installed on the write zio by arc_write().
 *
 * The zio's private data is the arc_write_callback_t, from which the buf
 * being written and the consumer's own ready callback are taken.  In order,
 * this function:
 *   - undoes hdr state left by a previous invocation if the zio is being
 *     reexecuted;
 *   - invokes the consumer's awcb_ready callback;
 *   - marks the hdr IO_IN_PROGRESS (taking a reference for it) unless that
 *     was already done;
 *   - copies encryption parameters from the bp into the hdr for protected
 *     blocks, and adjusts the buf's encrypted/compressed flags;
 *   - computes the buf checksum and records psize, compression and
 *     compression level in the hdr;
 *   - unless the zio failed or psize is zero, fills the hdr with data by
 *     copying from the zio/buf or by sharing the buf's data.
 *
 * The whole body runs between spl_fstrans_mark() and spl_fstrans_unmark().
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;
	blkptr_t *bp = zio->io_bp;
	/* A hole has no physical size. */
	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
	fstrans_cookie_t cookie = spl_fstrans_mark();

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);

	/*
	 * If we're reexecuting this zio because the pool suspended, then
	 * cleanup any state that was previously set the first time the
	 * callback was invoked.
	 */
	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
		arc_cksum_free(hdr);
		arc_buf_unwatch(buf);
		if (hdr->b_l1hdr.b_pabd != NULL) {
			if (ARC_BUF_SHARED(buf)) {
				arc_unshare_buf(hdr, buf);
			} else {
				ASSERT(!arc_buf_is_shared(buf));
				arc_hdr_free_abd(hdr, B_FALSE);
			}
		}

		if (HDR_HAS_RABD(hdr))
			arc_hdr_free_abd(hdr, B_TRUE);
	}
	/* From here on the hdr must hold no data of its own. */
	ASSERT0P(hdr->b_l1hdr.b_pabd);
	ASSERT(!HDR_HAS_RABD(hdr));
	ASSERT(!HDR_SHARED_DATA(hdr));
	ASSERT(!arc_buf_is_shared(buf));

	/* Hand off to the consumer's ready callback. */
	callback->awcb_ready(zio, buf, callback->awcb_private);

	if (HDR_IO_IN_PROGRESS(hdr)) {
		/* Only a reexecuted zio may find the flag already set. */
		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
	} else {
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
	}

	if (BP_IS_PROTECTED(bp)) {
		/* ZIL blocks are written through zio_rewrite */
		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);

		/* Record which byteswap function (if any) applies. */
		if (BP_SHOULD_BYTESWAP(bp)) {
			if (BP_GET_LEVEL(bp) > 0) {
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
			}
		} else {
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}

		/* Copy the crypto parameters from the bp into the hdr. */
		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv);
		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
	} else {
		arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
	}

	/*
	 * If this block was written for raw encryption but the zio layer
	 * ended up only authenticating it, adjust the buffer flags now.
	 */
	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
	}

	/* this must be done after the buffer flags are adjusted */
	arc_cksum_compute(buf);

	/* Holes and embedded bps are recorded as uncompressed. */
	enum zio_compress compress;
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
		compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
		compress = BP_GET_COMPRESS(bp);
	}
	HDR_SET_PSIZE(hdr, psize);
	arc_hdr_set_compress(hdr, compress);
	hdr->b_complevel = zio->io_prop.zp_complevel;

	/* Nothing to store in the hdr on error or for a zero-size block. */
	if (zio->io_error != 0 || psize == 0)
		goto out;

	/*
	 * Fill the hdr with data. If the buffer is encrypted we have no choice
	 * but to copy the data into b_rabd. If the hdr is compressed, the data
	 * we want is available from the zio, otherwise we can take it from
	 * the buf.
	 *
	 * We might be able to share the buf's data with the hdr here. However,
	 * doing so would cause the ARC to be full of linear ABDs if we write a
	 * lot of shareable data. As a compromise, we check whether scattered
	 * ABDs are allowed, and assume that if they are then the user wants
	 * the ARC to be primarily filled with them regardless of the data being
	 * written. Therefore, if they're allowed then we allocate one and copy
	 * the data into it; otherwise, we share the data directly if we can.
	 */
	if (ARC_BUF_ENCRYPTED(buf)) {
		ASSERT3U(psize, >, 0);
		ASSERT(ARC_BUF_COMPRESSED(buf));
		arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
		    ARC_HDR_USE_RESERVE);
		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
	} else if (!(HDR_UNCACHED(hdr) ||
	    abd_size_alloc_linear(arc_buf_size(buf))) ||
	    !arc_can_share(hdr, buf)) {
		/*
		 * Ideally, we would always copy the io_abd into b_pabd, but the
		 * user may have disabled compressed ARC, thus we must check the
		 * hdr's compression setting rather than the io_bp's.
		 */
		if (BP_IS_ENCRYPTED(bp)) {
			ASSERT3U(psize, >, 0);
			arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
			    ARC_HDR_USE_RESERVE);
			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
		    !ARC_BUF_COMPRESSED(buf)) {
			/* hdr is compressed but buf is not: copy from zio. */
			ASSERT3U(psize, >, 0);
			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
		} else {
			/* Copy straight from the buf's data. */
			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
			    arc_buf_size(buf));
		}
	} else {
		/* Share the buf's data with the hdr instead of copying. */
		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
		ASSERT(ARC_BUF_LAST(buf));

		arc_share_buf(hdr, buf);
	}

out:
	arc_hdr_verify(hdr, bp);
	spl_fstrans_unmark(cookie);
}
6988
6989 static void
arc_write_children_ready(zio_t * zio)6990 arc_write_children_ready(zio_t *zio)
6991 {
6992 arc_write_callback_t *callback = zio->io_private;
6993 arc_buf_t *buf = callback->awcb_buf;
6994
6995 callback->awcb_children_ready(zio, buf, callback->awcb_private);
6996 }
6997
/*
 * "Done" callback installed on the write zio by arc_write().
 *
 * On success the hdr is given the identity (DVA and physical birth) of the
 * block that was written, or has its identity discarded for holes and
 * embedded bps.  A hdr that ends up with an identity is inserted into the
 * hash table; a pre-existing entry for the same block is tolerated only
 * for rewrites, nopwrites and dedup.  In every case the IO_IN_PROGRESS
 * flag is cleared and the reference taken for it in arc_write_ready() is
 * dropped.  Finally the consumer's awcb_done callback is invoked, the
 * zio's abd is freed and the arc_write_callback_t is released.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT0P(hdr->b_l1hdr.b_acb);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);

		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
			buf_discard_identity(hdr);
		} else {
			/* Adopt the identity of the block just written. */
			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
			hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp);
		}
	} else {
		ASSERT(HDR_EMPTY(hdr));
	}

	/*
	 * If the block to be written was all-zero or compressed enough to be
	 * embedded in the BP, no write was performed so there will be no
	 * dva/birth/checksum.  The buffer must therefore remain anonymous
	 * (and uncached).
	 */
	if (!HDR_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		ASSERT0(zio->io_error);

		arc_cksum_verify(buf);

		/* hash_lock is set by buf_hash_insert() and held on return. */
		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists != NULL) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad overwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
				ASSERT(zfs_refcount_is_zero(
				    &exists->b_l1hdr.b_refcnt));
				/*
				 * Destroy the stale entry, then retry the
				 * insert, which must now succeed.
				 */
				arc_change_state(arc_anon, exists);
				arc_hdr_destroy(exists);
				mutex_exit(hash_lock);
				exists = buf_hash_insert(hdr, &hash_lock);
				ASSERT0P(exists);
			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
				/* nopwrite */
				ASSERT(zio->io_prop.zp_nopwrite);
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad nopwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
			} else {
				/* Dedup */
				ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
				ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
				ASSERT(BP_GET_DEDUP(zio->io_bp));
				ASSERT0(BP_GET_LEVEL(zio->io_bp));
			}
		}
		/* Drop the reference taken for IO_IN_PROGRESS. */
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		VERIFY3S(remove_reference(hdr, hdr), >, 0);
		/* if it's not anon, we are doing a scrub */
		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
			arc_access(hdr, 0, B_FALSE);
		mutex_exit(hash_lock);
	} else {
		/* No identity: the hdr stays out of the hash table. */
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		VERIFY3S(remove_reference(hdr, hdr), >, 0);
	}

	callback->awcb_done(zio, buf, callback->awcb_private);

	/* Release the abd and the callback record created by arc_write(). */
	abd_free(zio->io_abd);
	kmem_free(callback, sizeof (arc_write_callback_t));
}
7083
7084 zio_t *
arc_write(zio_t * pio,spa_t * spa,uint64_t txg,blkptr_t * bp,arc_buf_t * buf,boolean_t uncached,boolean_t l2arc,const zio_prop_t * zp,arc_write_done_func_t * ready,arc_write_done_func_t * children_ready,arc_write_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,const zbookmark_phys_t * zb)7085 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
7086 blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
7087 const zio_prop_t *zp, arc_write_done_func_t *ready,
7088 arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
7089 void *private, zio_priority_t priority, int zio_flags,
7090 const zbookmark_phys_t *zb)
7091 {
7092 arc_buf_hdr_t *hdr = buf->b_hdr;
7093 arc_write_callback_t *callback;
7094 zio_t *zio;
7095 zio_prop_t localprop = *zp;
7096
7097 ASSERT3P(ready, !=, NULL);
7098 ASSERT3P(done, !=, NULL);
7099 ASSERT(!HDR_IO_ERROR(hdr));
7100 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
7101 ASSERT0P(hdr->b_l1hdr.b_acb);
7102 ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
7103 if (uncached)
7104 arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
7105 else if (l2arc)
7106 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
7107
7108 if (ARC_BUF_ENCRYPTED(buf)) {
7109 ASSERT(ARC_BUF_COMPRESSED(buf));
7110 localprop.zp_encrypt = B_TRUE;
7111 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7112 localprop.zp_complevel = hdr->b_complevel;
7113 localprop.zp_byteorder =
7114 (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
7115 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
7116 memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
7117 ZIO_DATA_SALT_LEN);
7118 memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
7119 ZIO_DATA_IV_LEN);
7120 memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
7121 ZIO_DATA_MAC_LEN);
7122 if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
7123 localprop.zp_nopwrite = B_FALSE;
7124 localprop.zp_copies =
7125 MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
7126 localprop.zp_gang_copies =
7127 MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
7128 }
7129 zio_flags |= ZIO_FLAG_RAW;
7130 } else if (ARC_BUF_COMPRESSED(buf)) {
7131 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
7132 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7133 localprop.zp_complevel = hdr->b_complevel;
7134 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
7135 }
7136 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
7137 callback->awcb_ready = ready;
7138 callback->awcb_children_ready = children_ready;
7139 callback->awcb_done = done;
7140 callback->awcb_private = private;
7141 callback->awcb_buf = buf;
7142
7143 /*
7144 * The hdr's b_pabd is now stale, free it now. A new data block
7145 * will be allocated when the zio pipeline calls arc_write_ready().
7146 */
7147 if (hdr->b_l1hdr.b_pabd != NULL) {
7148 /*
7149 * If the buf is currently sharing the data block with
7150 * the hdr then we need to break that relationship here.
7151 * The hdr will remain with a NULL data pointer and the
7152 * buf will take sole ownership of the block.
7153 */
7154 if (ARC_BUF_SHARED(buf)) {
7155 arc_unshare_buf(hdr, buf);
7156 } else {
7157 ASSERT(!arc_buf_is_shared(buf));
7158 arc_hdr_free_abd(hdr, B_FALSE);
7159 }
7160 VERIFY3P(buf->b_data, !=, NULL);
7161 }
7162
7163 if (HDR_HAS_RABD(hdr))
7164 arc_hdr_free_abd(hdr, B_TRUE);
7165
7166 if (!(zio_flags & ZIO_FLAG_RAW))
7167 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
7168
7169 ASSERT(!arc_buf_is_shared(buf));
7170 ASSERT0P(hdr->b_l1hdr.b_pabd);
7171
7172 zio = zio_write(pio, spa, txg, bp,
7173 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
7174 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
7175 (children_ready != NULL) ? arc_write_children_ready : NULL,
7176 arc_write_done, callback, priority, zio_flags, zb);
7177
7178 return (zio);
7179 }
7180
7181 void
arc_tempreserve_clear(uint64_t reserve)7182 arc_tempreserve_clear(uint64_t reserve)
7183 {
7184 atomic_add_64(&arc_tempreserve, -reserve);
7185 ASSERT((int64_t)arc_tempreserve >= 0);
7186 }
7187
7188 int
arc_tempreserve_space(spa_t * spa,uint64_t reserve,uint64_t txg)7189 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
7190 {
7191 int error;
7192 uint64_t anon_size;
7193
7194 if (!arc_no_grow &&
7195 reserve > arc_c/4 &&
7196 reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
7197 arc_c = MIN(arc_c_max, reserve * 4);
7198
7199 /*
7200 * Throttle when the calculated memory footprint for the TXG
7201 * exceeds the target ARC size.
7202 */
7203 if (reserve > arc_c) {
7204 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
7205 return (SET_ERROR(ERESTART));
7206 }
7207
7208 /*
7209 * Don't count loaned bufs as in flight dirty data to prevent long
7210 * network delays from blocking transactions that are ready to be
7211 * assigned to a txg.
7212 */
7213
7214 /* assert that it has not wrapped around */
7215 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
7216
7217 anon_size = MAX((int64_t)
7218 (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
7219 zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
7220 arc_loaned_bytes), 0);
7221
7222 /*
7223 * Writes will, almost always, require additional memory allocations
7224 * in order to compress/encrypt/etc the data. We therefore need to
7225 * make sure that there is sufficient available memory for this.
7226 */
7227 error = arc_memory_throttle(spa, reserve, txg);
7228 if (error != 0)
7229 return (error);
7230
7231 /*
7232 * Throttle writes when the amount of dirty data in the cache
7233 * gets too large. We try to keep the cache less than half full
7234 * of dirty blocks so that our sync times don't grow too large.
7235 *
7236 * In the case of one pool being built on another pool, we want
7237 * to make sure we don't end up throttling the lower (backing)
7238 * pool when the upper pool is the majority contributor to dirty
7239 * data. To insure we make forward progress during throttling, we
7240 * also check the current pool's net dirty data and only throttle
7241 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
7242 * data in the cache.
7243 *
7244 * Note: if two requests come in concurrently, we might let them
7245 * both succeed, when one of them should fail. Not a huge deal.
7246 */
7247 uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
7248 uint64_t spa_dirty_anon = spa_dirty_data(spa);
7249 uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
7250 if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
7251 anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
7252 spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
7253 #ifdef ZFS_DEBUG
7254 uint64_t meta_esize = zfs_refcount_count(
7255 &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7256 uint64_t data_esize =
7257 zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7258 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
7259 "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
7260 (u_longlong_t)arc_tempreserve >> 10,
7261 (u_longlong_t)meta_esize >> 10,
7262 (u_longlong_t)data_esize >> 10,
7263 (u_longlong_t)reserve >> 10,
7264 (u_longlong_t)rarc_c >> 10);
7265 #endif
7266 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
7267 return (SET_ERROR(ERESTART));
7268 }
7269 atomic_add_64(&arc_tempreserve, reserve);
7270 return (0);
7271 }
7272
7273 static void
arc_kstat_update_state(arc_state_t * state,kstat_named_t * size,kstat_named_t * data,kstat_named_t * metadata,kstat_named_t * evict_data,kstat_named_t * evict_metadata)7274 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
7275 kstat_named_t *data, kstat_named_t *metadata,
7276 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
7277 {
7278 data->value.ui64 =
7279 zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
7280 metadata->value.ui64 =
7281 zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
7282 size->value.ui64 = data->value.ui64 + metadata->value.ui64;
7283 evict_data->value.ui64 =
7284 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
7285 evict_metadata->value.ui64 =
7286 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
7287 }
7288
7289 static int
arc_kstat_update(kstat_t * ksp,int rw)7290 arc_kstat_update(kstat_t *ksp, int rw)
7291 {
7292 arc_stats_t *as = ksp->ks_data;
7293
7294 if (rw == KSTAT_WRITE)
7295 return (SET_ERROR(EACCES));
7296
7297 as->arcstat_hits.value.ui64 =
7298 wmsum_value(&arc_sums.arcstat_hits);
7299 as->arcstat_iohits.value.ui64 =
7300 wmsum_value(&arc_sums.arcstat_iohits);
7301 as->arcstat_misses.value.ui64 =
7302 wmsum_value(&arc_sums.arcstat_misses);
7303 as->arcstat_demand_data_hits.value.ui64 =
7304 wmsum_value(&arc_sums.arcstat_demand_data_hits);
7305 as->arcstat_demand_data_iohits.value.ui64 =
7306 wmsum_value(&arc_sums.arcstat_demand_data_iohits);
7307 as->arcstat_demand_data_misses.value.ui64 =
7308 wmsum_value(&arc_sums.arcstat_demand_data_misses);
7309 as->arcstat_demand_metadata_hits.value.ui64 =
7310 wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
7311 as->arcstat_demand_metadata_iohits.value.ui64 =
7312 wmsum_value(&arc_sums.arcstat_demand_metadata_iohits);
7313 as->arcstat_demand_metadata_misses.value.ui64 =
7314 wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
7315 as->arcstat_prefetch_data_hits.value.ui64 =
7316 wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
7317 as->arcstat_prefetch_data_iohits.value.ui64 =
7318 wmsum_value(&arc_sums.arcstat_prefetch_data_iohits);
7319 as->arcstat_prefetch_data_misses.value.ui64 =
7320 wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
7321 as->arcstat_prefetch_metadata_hits.value.ui64 =
7322 wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
7323 as->arcstat_prefetch_metadata_iohits.value.ui64 =
7324 wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits);
7325 as->arcstat_prefetch_metadata_misses.value.ui64 =
7326 wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
7327 as->arcstat_mru_hits.value.ui64 =
7328 wmsum_value(&arc_sums.arcstat_mru_hits);
7329 as->arcstat_mru_ghost_hits.value.ui64 =
7330 wmsum_value(&arc_sums.arcstat_mru_ghost_hits);
7331 as->arcstat_mfu_hits.value.ui64 =
7332 wmsum_value(&arc_sums.arcstat_mfu_hits);
7333 as->arcstat_mfu_ghost_hits.value.ui64 =
7334 wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
7335 as->arcstat_uncached_hits.value.ui64 =
7336 wmsum_value(&arc_sums.arcstat_uncached_hits);
7337 as->arcstat_deleted.value.ui64 =
7338 wmsum_value(&arc_sums.arcstat_deleted);
7339 as->arcstat_mutex_miss.value.ui64 =
7340 wmsum_value(&arc_sums.arcstat_mutex_miss);
7341 as->arcstat_access_skip.value.ui64 =
7342 wmsum_value(&arc_sums.arcstat_access_skip);
7343 as->arcstat_evict_skip.value.ui64 =
7344 wmsum_value(&arc_sums.arcstat_evict_skip);
7345 as->arcstat_evict_not_enough.value.ui64 =
7346 wmsum_value(&arc_sums.arcstat_evict_not_enough);
7347 as->arcstat_evict_l2_cached.value.ui64 =
7348 wmsum_value(&arc_sums.arcstat_evict_l2_cached);
7349 as->arcstat_evict_l2_eligible.value.ui64 =
7350 wmsum_value(&arc_sums.arcstat_evict_l2_eligible);
7351 as->arcstat_evict_l2_eligible_mfu.value.ui64 =
7352 wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu);
7353 as->arcstat_evict_l2_eligible_mru.value.ui64 =
7354 wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru);
7355 as->arcstat_evict_l2_ineligible.value.ui64 =
7356 wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
7357 as->arcstat_evict_l2_skip.value.ui64 =
7358 wmsum_value(&arc_sums.arcstat_evict_l2_skip);
7359 as->arcstat_hash_elements.value.ui64 =
7360 as->arcstat_hash_elements_max.value.ui64 =
7361 wmsum_value(&arc_sums.arcstat_hash_elements);
7362 as->arcstat_hash_collisions.value.ui64 =
7363 wmsum_value(&arc_sums.arcstat_hash_collisions);
7364 as->arcstat_hash_chains.value.ui64 =
7365 wmsum_value(&arc_sums.arcstat_hash_chains);
7366 as->arcstat_size.value.ui64 =
7367 aggsum_value(&arc_sums.arcstat_size);
7368 as->arcstat_compressed_size.value.ui64 =
7369 wmsum_value(&arc_sums.arcstat_compressed_size);
7370 as->arcstat_uncompressed_size.value.ui64 =
7371 wmsum_value(&arc_sums.arcstat_uncompressed_size);
7372 as->arcstat_overhead_size.value.ui64 =
7373 wmsum_value(&arc_sums.arcstat_overhead_size);
7374 as->arcstat_hdr_size.value.ui64 =
7375 wmsum_value(&arc_sums.arcstat_hdr_size);
7376 as->arcstat_data_size.value.ui64 =
7377 wmsum_value(&arc_sums.arcstat_data_size);
7378 as->arcstat_metadata_size.value.ui64 =
7379 wmsum_value(&arc_sums.arcstat_metadata_size);
7380 as->arcstat_dbuf_size.value.ui64 =
7381 wmsum_value(&arc_sums.arcstat_dbuf_size);
7382 #if defined(COMPAT_FREEBSD11)
7383 as->arcstat_other_size.value.ui64 =
7384 wmsum_value(&arc_sums.arcstat_bonus_size) +
7385 aggsum_value(&arc_sums.arcstat_dnode_size) +
7386 wmsum_value(&arc_sums.arcstat_dbuf_size);
7387 #endif
7388
7389 arc_kstat_update_state(arc_anon,
7390 &as->arcstat_anon_size,
7391 &as->arcstat_anon_data,
7392 &as->arcstat_anon_metadata,
7393 &as->arcstat_anon_evictable_data,
7394 &as->arcstat_anon_evictable_metadata);
7395 arc_kstat_update_state(arc_mru,
7396 &as->arcstat_mru_size,
7397 &as->arcstat_mru_data,
7398 &as->arcstat_mru_metadata,
7399 &as->arcstat_mru_evictable_data,
7400 &as->arcstat_mru_evictable_metadata);
7401 arc_kstat_update_state(arc_mru_ghost,
7402 &as->arcstat_mru_ghost_size,
7403 &as->arcstat_mru_ghost_data,
7404 &as->arcstat_mru_ghost_metadata,
7405 &as->arcstat_mru_ghost_evictable_data,
7406 &as->arcstat_mru_ghost_evictable_metadata);
7407 arc_kstat_update_state(arc_mfu,
7408 &as->arcstat_mfu_size,
7409 &as->arcstat_mfu_data,
7410 &as->arcstat_mfu_metadata,
7411 &as->arcstat_mfu_evictable_data,
7412 &as->arcstat_mfu_evictable_metadata);
7413 arc_kstat_update_state(arc_mfu_ghost,
7414 &as->arcstat_mfu_ghost_size,
7415 &as->arcstat_mfu_ghost_data,
7416 &as->arcstat_mfu_ghost_metadata,
7417 &as->arcstat_mfu_ghost_evictable_data,
7418 &as->arcstat_mfu_ghost_evictable_metadata);
7419 arc_kstat_update_state(arc_uncached,
7420 &as->arcstat_uncached_size,
7421 &as->arcstat_uncached_data,
7422 &as->arcstat_uncached_metadata,
7423 &as->arcstat_uncached_evictable_data,
7424 &as->arcstat_uncached_evictable_metadata);
7425
7426 as->arcstat_dnode_size.value.ui64 =
7427 aggsum_value(&arc_sums.arcstat_dnode_size);
7428 as->arcstat_bonus_size.value.ui64 =
7429 wmsum_value(&arc_sums.arcstat_bonus_size);
7430 as->arcstat_l2_ndev.value.ui64 = l2arc_ndev;
7431 as->arcstat_l2_hits.value.ui64 =
7432 wmsum_value(&arc_sums.arcstat_l2_hits);
7433 as->arcstat_l2_misses.value.ui64 =
7434 wmsum_value(&arc_sums.arcstat_l2_misses);
7435 as->arcstat_l2_prefetch_asize.value.ui64 =
7436 wmsum_value(&arc_sums.arcstat_l2_prefetch_asize);
7437 as->arcstat_l2_mru_asize.value.ui64 =
7438 wmsum_value(&arc_sums.arcstat_l2_mru_asize);
7439 as->arcstat_l2_mfu_asize.value.ui64 =
7440 wmsum_value(&arc_sums.arcstat_l2_mfu_asize);
7441 as->arcstat_l2_bufc_data_asize.value.ui64 =
7442 wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize);
7443 as->arcstat_l2_bufc_metadata_asize.value.ui64 =
7444 wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize);
7445 as->arcstat_l2_feeds.value.ui64 =
7446 wmsum_value(&arc_sums.arcstat_l2_feeds);
7447 as->arcstat_l2_rw_clash.value.ui64 =
7448 wmsum_value(&arc_sums.arcstat_l2_rw_clash);
7449 as->arcstat_l2_read_bytes.value.ui64 =
7450 wmsum_value(&arc_sums.arcstat_l2_read_bytes);
7451 as->arcstat_l2_write_bytes.value.ui64 =
7452 wmsum_value(&arc_sums.arcstat_l2_write_bytes);
7453 as->arcstat_l2_writes_sent.value.ui64 =
7454 wmsum_value(&arc_sums.arcstat_l2_writes_sent);
7455 as->arcstat_l2_writes_done.value.ui64 =
7456 wmsum_value(&arc_sums.arcstat_l2_writes_done);
7457 as->arcstat_l2_writes_error.value.ui64 =
7458 wmsum_value(&arc_sums.arcstat_l2_writes_error);
7459 as->arcstat_l2_writes_lock_retry.value.ui64 =
7460 wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry);
7461 as->arcstat_l2_evict_lock_retry.value.ui64 =
7462 wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry);
7463 as->arcstat_l2_evict_reading.value.ui64 =
7464 wmsum_value(&arc_sums.arcstat_l2_evict_reading);
7465 as->arcstat_l2_evict_l1cached.value.ui64 =
7466 wmsum_value(&arc_sums.arcstat_l2_evict_l1cached);
7467 as->arcstat_l2_free_on_write.value.ui64 =
7468 wmsum_value(&arc_sums.arcstat_l2_free_on_write);
7469 as->arcstat_l2_abort_lowmem.value.ui64 =
7470 wmsum_value(&arc_sums.arcstat_l2_abort_lowmem);
7471 as->arcstat_l2_cksum_bad.value.ui64 =
7472 wmsum_value(&arc_sums.arcstat_l2_cksum_bad);
7473 as->arcstat_l2_io_error.value.ui64 =
7474 wmsum_value(&arc_sums.arcstat_l2_io_error);
7475 as->arcstat_l2_lsize.value.ui64 =
7476 wmsum_value(&arc_sums.arcstat_l2_lsize);
7477 as->arcstat_l2_psize.value.ui64 =
7478 wmsum_value(&arc_sums.arcstat_l2_psize);
7479 as->arcstat_l2_hdr_size.value.ui64 =
7480 aggsum_value(&arc_sums.arcstat_l2_hdr_size);
7481 as->arcstat_l2_log_blk_writes.value.ui64 =
7482 wmsum_value(&arc_sums.arcstat_l2_log_blk_writes);
7483 as->arcstat_l2_log_blk_asize.value.ui64 =
7484 wmsum_value(&arc_sums.arcstat_l2_log_blk_asize);
7485 as->arcstat_l2_log_blk_count.value.ui64 =
7486 wmsum_value(&arc_sums.arcstat_l2_log_blk_count);
7487 as->arcstat_l2_rebuild_success.value.ui64 =
7488 wmsum_value(&arc_sums.arcstat_l2_rebuild_success);
7489 as->arcstat_l2_rebuild_abort_unsupported.value.ui64 =
7490 wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
7491 as->arcstat_l2_rebuild_abort_io_errors.value.ui64 =
7492 wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
7493 as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 =
7494 wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
7495 as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 =
7496 wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
7497 as->arcstat_l2_rebuild_abort_lowmem.value.ui64 =
7498 wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
7499 as->arcstat_l2_rebuild_size.value.ui64 =
7500 wmsum_value(&arc_sums.arcstat_l2_rebuild_size);
7501 as->arcstat_l2_rebuild_asize.value.ui64 =
7502 wmsum_value(&arc_sums.arcstat_l2_rebuild_asize);
7503 as->arcstat_l2_rebuild_bufs.value.ui64 =
7504 wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs);
7505 as->arcstat_l2_rebuild_bufs_precached.value.ui64 =
7506 wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached);
7507 as->arcstat_l2_rebuild_log_blks.value.ui64 =
7508 wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks);
7509 as->arcstat_memory_throttle_count.value.ui64 =
7510 wmsum_value(&arc_sums.arcstat_memory_throttle_count);
7511 as->arcstat_memory_direct_count.value.ui64 =
7512 wmsum_value(&arc_sums.arcstat_memory_direct_count);
7513 as->arcstat_memory_indirect_count.value.ui64 =
7514 wmsum_value(&arc_sums.arcstat_memory_indirect_count);
7515
7516 as->arcstat_memory_all_bytes.value.ui64 =
7517 arc_all_memory();
7518 as->arcstat_memory_free_bytes.value.ui64 =
7519 arc_free_memory();
7520 as->arcstat_memory_available_bytes.value.i64 =
7521 arc_available_memory();
7522
7523 as->arcstat_prune.value.ui64 =
7524 wmsum_value(&arc_sums.arcstat_prune);
7525 as->arcstat_meta_used.value.ui64 =
7526 wmsum_value(&arc_sums.arcstat_meta_used);
7527 as->arcstat_async_upgrade_sync.value.ui64 =
7528 wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
7529 as->arcstat_predictive_prefetch.value.ui64 =
7530 wmsum_value(&arc_sums.arcstat_predictive_prefetch);
7531 as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
7532 wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
7533 as->arcstat_demand_iohit_predictive_prefetch.value.ui64 =
7534 wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
7535 as->arcstat_prescient_prefetch.value.ui64 =
7536 wmsum_value(&arc_sums.arcstat_prescient_prefetch);
7537 as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
7538 wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
7539 as->arcstat_demand_iohit_prescient_prefetch.value.ui64 =
7540 wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
7541 as->arcstat_raw_size.value.ui64 =
7542 wmsum_value(&arc_sums.arcstat_raw_size);
7543 as->arcstat_cached_only_in_progress.value.ui64 =
7544 wmsum_value(&arc_sums.arcstat_cached_only_in_progress);
7545 as->arcstat_abd_chunk_waste_size.value.ui64 =
7546 wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size);
7547
7548 return (0);
7549 }
7550
7551 /*
7552 * This function *must* return indices evenly distributed between all
7553 * sublists of the multilist. This is needed due to how the ARC eviction
7554 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
7555 * distributed between all sublists and uses this assumption when
7556 * deciding which sublist to evict from and how much to evict from it.
7557 */
7558 static unsigned int
arc_state_multilist_index_func(multilist_t * ml,void * obj)7559 arc_state_multilist_index_func(multilist_t *ml, void *obj)
7560 {
7561 arc_buf_hdr_t *hdr = obj;
7562
7563 /*
7564 * We rely on b_dva to generate evenly distributed index
7565 * numbers using buf_hash below. So, as an added precaution,
7566 * let's make sure we never add empty buffers to the arc lists.
7567 */
7568 ASSERT(!HDR_EMPTY(hdr));
7569
7570 /*
7571 * The assumption here, is the hash value for a given
7572 * arc_buf_hdr_t will remain constant throughout its lifetime
7573 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
7574 * Thus, we don't need to store the header's sublist index
7575 * on insertion, as this index can be recalculated on removal.
7576 *
7577 * Also, the low order bits of the hash value are thought to be
7578 * distributed evenly. Otherwise, in the case that the multilist
7579 * has a power of two number of sublists, each sublists' usage
7580 * would not be evenly distributed. In this context full 64bit
7581 * division would be a waste of time, so limit it to 32 bits.
7582 */
7583 return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7584 multilist_get_num_sublists(ml));
7585 }
7586
7587 static unsigned int
arc_state_l2c_multilist_index_func(multilist_t * ml,void * obj)7588 arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
7589 {
7590 panic("Header %p insert into arc_l2c_only %p", obj, ml);
7591 }
7592
/*
 * Emit a CE_WARN message when warnings are requested (do_warn), the
 * tunable is set (non-zero) and its value differs from the value that is
 * actually in effect.  The message names the tunable and prints the value
 * being used instead.
 */
#define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {		\
	if (!(do_warn) || !(tuning) || ((tuning) == (value)))		\
		break;							\
	cmn_err(CE_WARN,						\
	    "ignoring tunable %s (using %llu instead)",			\
	    (#tuning), (u_longlong_t)(value));				\
} while (0)
7600
7601 /*
7602 * Called during module initialization and periodically thereafter to
7603 * apply reasonable changes to the exposed performance tunings. Can also be
7604 * called explicitly by param_set_arc_*() functions when ARC tunables are
7605 * updated manually. Non-zero zfs_* values which differ from the currently set
7606 * values will be applied.
7607 */
7608 void
arc_tuning_update(boolean_t verbose)7609 arc_tuning_update(boolean_t verbose)
7610 {
7611 uint64_t allmem = arc_all_memory();
7612
7613 /* Valid range: 32M - <arc_c_max> */
7614 if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
7615 (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
7616 (zfs_arc_min <= arc_c_max)) {
7617 arc_c_min = zfs_arc_min;
7618 arc_c = MAX(arc_c, arc_c_min);
7619 }
7620 WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
7621
7622 /* Valid range: 64M - <all physical memory> */
7623 if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
7624 (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
7625 (zfs_arc_max > arc_c_min)) {
7626 arc_c_max = zfs_arc_max;
7627 arc_c = MIN(arc_c, arc_c_max);
7628 if (arc_dnode_limit > arc_c_max)
7629 arc_dnode_limit = arc_c_max;
7630 }
7631 WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
7632
7633 /* Valid range: 0 - <all physical memory> */
7634 arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
7635 MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
7636 WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
7637
7638 /* Valid range: 1 - N */
7639 if (zfs_arc_grow_retry)
7640 arc_grow_retry = zfs_arc_grow_retry;
7641
7642 /* Valid range: 1 - N */
7643 if (zfs_arc_shrink_shift) {
7644 arc_shrink_shift = zfs_arc_shrink_shift;
7645 zfs_arc_no_grow_shift = MIN(zfs_arc_no_grow_shift,
7646 arc_shrink_shift - 1);
7647 }
7648
7649 /* Valid range: 1 - N ms */
7650 if (zfs_arc_min_prefetch_ms)
7651 arc_min_prefetch = MSEC_TO_TICK(zfs_arc_min_prefetch_ms);
7652
7653 /* Valid range: 1 - N ms */
7654 if (zfs_arc_min_prescient_prefetch_ms) {
7655 arc_min_prescient_prefetch =
7656 MSEC_TO_TICK(zfs_arc_min_prescient_prefetch_ms);
7657 }
7658
7659 /* Valid range: 0 - 100 */
7660 if (zfs_arc_lotsfree_percent <= 100)
7661 arc_lotsfree_percent = zfs_arc_lotsfree_percent;
7662 WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
7663 verbose);
7664
7665 /* Valid range: 0 - <all physical memory> */
7666 if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
7667 arc_sys_free = MIN(zfs_arc_sys_free, allmem);
7668 WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
7669 }
7670
7671 static void
arc_state_multilist_init(multilist_t * ml,multilist_sublist_index_func_t * index_func,int * maxcountp)7672 arc_state_multilist_init(multilist_t *ml,
7673 multilist_sublist_index_func_t *index_func, int *maxcountp)
7674 {
7675 multilist_create(ml, sizeof (arc_buf_hdr_t),
7676 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
7677 *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
7678 }
7679
7680 static void
arc_state_init(void)7681 arc_state_init(void)
7682 {
7683 int num_sublists = 0;
7684
7685 arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
7686 arc_state_multilist_index_func, &num_sublists);
7687 arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
7688 arc_state_multilist_index_func, &num_sublists);
7689 arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
7690 arc_state_multilist_index_func, &num_sublists);
7691 arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
7692 arc_state_multilist_index_func, &num_sublists);
7693 arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
7694 arc_state_multilist_index_func, &num_sublists);
7695 arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
7696 arc_state_multilist_index_func, &num_sublists);
7697 arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
7698 arc_state_multilist_index_func, &num_sublists);
7699 arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
7700 arc_state_multilist_index_func, &num_sublists);
7701 arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
7702 arc_state_multilist_index_func, &num_sublists);
7703 arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
7704 arc_state_multilist_index_func, &num_sublists);
7705
7706 /*
7707 * L2 headers should never be on the L2 state list since they don't
7708 * have L1 headers allocated. Special index function asserts that.
7709 */
7710 arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
7711 arc_state_l2c_multilist_index_func, &num_sublists);
7712 arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
7713 arc_state_l2c_multilist_index_func, &num_sublists);
7714
7715 /*
7716 * Keep track of the number of markers needed to reclaim buffers from
7717 * any ARC state. The markers will be pre-allocated so as to minimize
7718 * the number of memory allocations performed by the eviction thread.
7719 */
7720 arc_state_evict_marker_count = num_sublists;
7721
7722 zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7723 zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7724 zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7725 zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7726 zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7727 zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7728 zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7729 zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7730 zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7731 zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7732 zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7733 zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7734 zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
7735 zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
7736
7737 zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
7738 zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
7739 zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
7740 zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
7741 zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
7742 zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
7743 zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
7744 zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
7745 zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
7746 zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
7747 zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
7748 zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
7749 zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
7750 zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
7751
7752 wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
7753 wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
7754 wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
7755 wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
7756
7757 wmsum_init(&arc_sums.arcstat_hits, 0);
7758 wmsum_init(&arc_sums.arcstat_iohits, 0);
7759 wmsum_init(&arc_sums.arcstat_misses, 0);
7760 wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
7761 wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
7762 wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
7763 wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
7764 wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
7765 wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
7766 wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
7767 wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
7768 wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
7769 wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
7770 wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
7771 wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
7772 wmsum_init(&arc_sums.arcstat_mru_hits, 0);
7773 wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
7774 wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
7775 wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
7776 wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
7777 wmsum_init(&arc_sums.arcstat_deleted, 0);
7778 wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
7779 wmsum_init(&arc_sums.arcstat_access_skip, 0);
7780 wmsum_init(&arc_sums.arcstat_evict_skip, 0);
7781 wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
7782 wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
7783 wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
7784 wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
7785 wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
7786 wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
7787 wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
7788 wmsum_init(&arc_sums.arcstat_hash_elements, 0);
7789 wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
7790 wmsum_init(&arc_sums.arcstat_hash_chains, 0);
7791 aggsum_init(&arc_sums.arcstat_size, 0);
7792 wmsum_init(&arc_sums.arcstat_compressed_size, 0);
7793 wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
7794 wmsum_init(&arc_sums.arcstat_overhead_size, 0);
7795 wmsum_init(&arc_sums.arcstat_hdr_size, 0);
7796 wmsum_init(&arc_sums.arcstat_data_size, 0);
7797 wmsum_init(&arc_sums.arcstat_metadata_size, 0);
7798 wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
7799 aggsum_init(&arc_sums.arcstat_dnode_size, 0);
7800 wmsum_init(&arc_sums.arcstat_bonus_size, 0);
7801 wmsum_init(&arc_sums.arcstat_l2_hits, 0);
7802 wmsum_init(&arc_sums.arcstat_l2_misses, 0);
7803 wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
7804 wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
7805 wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
7806 wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
7807 wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
7808 wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
7809 wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
7810 wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
7811 wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
7812 wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
7813 wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
7814 wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
7815 wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
7816 wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
7817 wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
7818 wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
7819 wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
7820 wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
7821 wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
7822 wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
7823 wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
7824 wmsum_init(&arc_sums.arcstat_l2_psize, 0);
7825 aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
7826 wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
7827 wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
7828 wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
7829 wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
7830 wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
7831 wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
7832 wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
7833 wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
7834 wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
7835 wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
7836 wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
7837 wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
7838 wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
7839 wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
7840 wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
7841 wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
7842 wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
7843 wmsum_init(&arc_sums.arcstat_prune, 0);
7844 wmsum_init(&arc_sums.arcstat_meta_used, 0);
7845 wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
7846 wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
7847 wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
7848 wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
7849 wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
7850 wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
7851 wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
7852 wmsum_init(&arc_sums.arcstat_raw_size, 0);
7853 wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
7854 wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
7855
7856 arc_anon->arcs_state = ARC_STATE_ANON;
7857 arc_mru->arcs_state = ARC_STATE_MRU;
7858 arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
7859 arc_mfu->arcs_state = ARC_STATE_MFU;
7860 arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
7861 arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
7862 arc_uncached->arcs_state = ARC_STATE_UNCACHED;
7863 }
7864
7865 static void
arc_state_fini(void)7866 arc_state_fini(void)
7867 {
7868 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7869 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7870 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7871 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7872 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7873 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7874 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7875 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7876 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7877 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7878 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7879 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7880 zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
7881 zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
7882
7883 zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
7884 zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
7885 zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
7886 zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
7887 zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
7888 zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
7889 zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
7890 zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
7891 zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
7892 zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
7893 zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
7894 zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
7895 zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
7896 zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
7897
7898 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
7899 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7900 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7901 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7902 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
7903 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7904 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
7905 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7906 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7907 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
7908 multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
7909 multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
7910
7911 wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
7912 wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
7913 wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
7914 wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
7915
7916 wmsum_fini(&arc_sums.arcstat_hits);
7917 wmsum_fini(&arc_sums.arcstat_iohits);
7918 wmsum_fini(&arc_sums.arcstat_misses);
7919 wmsum_fini(&arc_sums.arcstat_demand_data_hits);
7920 wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
7921 wmsum_fini(&arc_sums.arcstat_demand_data_misses);
7922 wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
7923 wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
7924 wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
7925 wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
7926 wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
7927 wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
7928 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
7929 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
7930 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
7931 wmsum_fini(&arc_sums.arcstat_mru_hits);
7932 wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
7933 wmsum_fini(&arc_sums.arcstat_mfu_hits);
7934 wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
7935 wmsum_fini(&arc_sums.arcstat_uncached_hits);
7936 wmsum_fini(&arc_sums.arcstat_deleted);
7937 wmsum_fini(&arc_sums.arcstat_mutex_miss);
7938 wmsum_fini(&arc_sums.arcstat_access_skip);
7939 wmsum_fini(&arc_sums.arcstat_evict_skip);
7940 wmsum_fini(&arc_sums.arcstat_evict_not_enough);
7941 wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
7942 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
7943 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
7944 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
7945 wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
7946 wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
7947 wmsum_fini(&arc_sums.arcstat_hash_elements);
7948 wmsum_fini(&arc_sums.arcstat_hash_collisions);
7949 wmsum_fini(&arc_sums.arcstat_hash_chains);
7950 aggsum_fini(&arc_sums.arcstat_size);
7951 wmsum_fini(&arc_sums.arcstat_compressed_size);
7952 wmsum_fini(&arc_sums.arcstat_uncompressed_size);
7953 wmsum_fini(&arc_sums.arcstat_overhead_size);
7954 wmsum_fini(&arc_sums.arcstat_hdr_size);
7955 wmsum_fini(&arc_sums.arcstat_data_size);
7956 wmsum_fini(&arc_sums.arcstat_metadata_size);
7957 wmsum_fini(&arc_sums.arcstat_dbuf_size);
7958 aggsum_fini(&arc_sums.arcstat_dnode_size);
7959 wmsum_fini(&arc_sums.arcstat_bonus_size);
7960 wmsum_fini(&arc_sums.arcstat_l2_hits);
7961 wmsum_fini(&arc_sums.arcstat_l2_misses);
7962 wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
7963 wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
7964 wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
7965 wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
7966 wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
7967 wmsum_fini(&arc_sums.arcstat_l2_feeds);
7968 wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
7969 wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
7970 wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
7971 wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
7972 wmsum_fini(&arc_sums.arcstat_l2_writes_done);
7973 wmsum_fini(&arc_sums.arcstat_l2_writes_error);
7974 wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
7975 wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
7976 wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
7977 wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
7978 wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
7979 wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
7980 wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
7981 wmsum_fini(&arc_sums.arcstat_l2_io_error);
7982 wmsum_fini(&arc_sums.arcstat_l2_lsize);
7983 wmsum_fini(&arc_sums.arcstat_l2_psize);
7984 aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
7985 wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
7986 wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
7987 wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
7988 wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
7989 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
7990 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
7991 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
7992 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
7993 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
7994 wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
7995 wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
7996 wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
7997 wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
7998 wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
7999 wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
8000 wmsum_fini(&arc_sums.arcstat_memory_direct_count);
8001 wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
8002 wmsum_fini(&arc_sums.arcstat_prune);
8003 wmsum_fini(&arc_sums.arcstat_meta_used);
8004 wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
8005 wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
8006 wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
8007 wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
8008 wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
8009 wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
8010 wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
8011 wmsum_fini(&arc_sums.arcstat_raw_size);
8012 wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
8013 wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
8014 }
8015
8016 uint64_t
arc_target_bytes(void)8017 arc_target_bytes(void)
8018 {
8019 return (arc_c);
8020 }
8021
8022 void
arc_set_limits(uint64_t allmem)8023 arc_set_limits(uint64_t allmem)
8024 {
8025 /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
8026 arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
8027
8028 /* How to set default max varies by platform. */
8029 arc_c_max = arc_default_max(arc_c_min, allmem);
8030 }
8031
8032 void
arc_init(void)8033 arc_init(void)
8034 {
8035 uint64_t percent, allmem = arc_all_memory();
8036 mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
8037 list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
8038 offsetof(arc_evict_waiter_t, aew_node));
8039
8040 arc_min_prefetch = MSEC_TO_TICK(1000);
8041 arc_min_prescient_prefetch = MSEC_TO_TICK(6000);
8042
8043 #if defined(_KERNEL)
8044 arc_lowmem_init();
8045 #endif
8046
8047 arc_set_limits(allmem);
8048
8049 #ifdef _KERNEL
8050 /*
8051 * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
8052 * environment before the module was loaded, don't block setting the
8053 * maximum because it is less than arc_c_min, instead, reset arc_c_min
8054 * to a lower value.
8055 * zfs_arc_min will be handled by arc_tuning_update().
8056 */
8057 if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
8058 zfs_arc_max < allmem) {
8059 arc_c_max = zfs_arc_max;
8060 if (arc_c_min >= arc_c_max) {
8061 arc_c_min = MAX(zfs_arc_max / 2,
8062 2ULL << SPA_MAXBLOCKSHIFT);
8063 }
8064 }
8065 #else
8066 /*
8067 * In userland, there's only the memory pressure that we artificially
8068 * create (see arc_available_memory()). Don't let arc_c get too
8069 * small, because it can cause transactions to be larger than
8070 * arc_c, causing arc_tempreserve_space() to fail.
8071 */
8072 arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
8073 #endif
8074
8075 arc_c = arc_c_min;
8076 /*
8077 * 32-bit fixed point fractions of metadata from total ARC size,
8078 * MRU data from all data and MRU metadata from all metadata.
8079 */
8080 arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */
8081 arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */
8082 arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */
8083
8084 percent = MIN(zfs_arc_dnode_limit_percent, 100);
8085 arc_dnode_limit = arc_c_max * percent / 100;
8086
8087 /* Apply user specified tunings */
8088 arc_tuning_update(B_TRUE);
8089
8090 /* if kmem_flags are set, lets try to use less memory */
8091 if (kmem_debugging())
8092 arc_c = arc_c / 2;
8093 if (arc_c < arc_c_min)
8094 arc_c = arc_c_min;
8095
8096 arc_register_hotplug();
8097
8098 arc_state_init();
8099
8100 buf_init();
8101
8102 list_create(&arc_prune_list, sizeof (arc_prune_t),
8103 offsetof(arc_prune_t, p_node));
8104 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
8105
8106 arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
8107 defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
8108
8109 arc_evict_thread_init();
8110
8111 list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
8112 offsetof(arc_async_flush_t, af_node));
8113 mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
8114 arc_flush_taskq = taskq_create("arc_flush", MIN(boot_ncpus, 4),
8115 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
8116
8117 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
8118 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
8119
8120 if (arc_ksp != NULL) {
8121 arc_ksp->ks_data = &arc_stats;
8122 arc_ksp->ks_update = arc_kstat_update;
8123 kstat_install(arc_ksp);
8124 }
8125
8126 arc_state_evict_markers =
8127 arc_state_alloc_markers(arc_state_evict_marker_count);
8128 arc_evict_zthr = zthr_create_timer("arc_evict",
8129 arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
8130 arc_reap_zthr = zthr_create_timer("arc_reap",
8131 arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
8132
8133 arc_warm = B_FALSE;
8134
8135 /*
8136 * Calculate maximum amount of dirty data per pool.
8137 *
8138 * If it has been set by a module parameter, take that.
8139 * Otherwise, use a percentage of physical memory defined by
8140 * zfs_dirty_data_max_percent (default 10%) with a cap at
8141 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
8142 */
8143 #ifdef __LP64__
8144 if (zfs_dirty_data_max_max == 0)
8145 zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
8146 allmem * zfs_dirty_data_max_max_percent / 100);
8147 #else
8148 if (zfs_dirty_data_max_max == 0)
8149 zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
8150 allmem * zfs_dirty_data_max_max_percent / 100);
8151 #endif
8152
8153 if (zfs_dirty_data_max == 0) {
8154 zfs_dirty_data_max = allmem *
8155 zfs_dirty_data_max_percent / 100;
8156 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
8157 zfs_dirty_data_max_max);
8158 }
8159
8160 if (zfs_wrlog_data_max == 0) {
8161
8162 /*
8163 * dp_wrlog_total is reduced for each txg at the end of
8164 * spa_sync(). However, dp_dirty_total is reduced every time
8165 * a block is written out. Thus under normal operation,
8166 * dp_wrlog_total could grow 2 times as big as
8167 * zfs_dirty_data_max.
8168 */
8169 zfs_wrlog_data_max = zfs_dirty_data_max * 2;
8170 }
8171 }
8172
8173 void
arc_fini(void)8174 arc_fini(void)
8175 {
8176 arc_prune_t *p;
8177
8178 #ifdef _KERNEL
8179 arc_lowmem_fini();
8180 #endif /* _KERNEL */
8181
8182 /* Wait for any background flushes */
8183 taskq_wait(arc_flush_taskq);
8184 taskq_destroy(arc_flush_taskq);
8185
8186 /* Use B_TRUE to ensure *all* buffers are evicted */
8187 arc_flush(NULL, B_TRUE);
8188
8189 if (arc_ksp != NULL) {
8190 kstat_delete(arc_ksp);
8191 arc_ksp = NULL;
8192 }
8193
8194 taskq_wait(arc_prune_taskq);
8195 taskq_destroy(arc_prune_taskq);
8196
8197 list_destroy(&arc_async_flush_list);
8198 mutex_destroy(&arc_async_flush_lock);
8199
8200 mutex_enter(&arc_prune_mtx);
8201 while ((p = list_remove_head(&arc_prune_list)) != NULL) {
8202 (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
8203 zfs_refcount_destroy(&p->p_refcnt);
8204 kmem_free(p, sizeof (*p));
8205 }
8206 mutex_exit(&arc_prune_mtx);
8207
8208 list_destroy(&arc_prune_list);
8209 mutex_destroy(&arc_prune_mtx);
8210
8211 if (arc_evict_taskq != NULL)
8212 taskq_wait(arc_evict_taskq);
8213
8214 (void) zthr_cancel(arc_evict_zthr);
8215 (void) zthr_cancel(arc_reap_zthr);
8216 arc_state_free_markers(arc_state_evict_markers,
8217 arc_state_evict_marker_count);
8218
8219 if (arc_evict_taskq != NULL) {
8220 taskq_destroy(arc_evict_taskq);
8221 kmem_free(arc_evict_arg,
8222 sizeof (evict_arg_t) * zfs_arc_evict_threads);
8223 }
8224
8225 mutex_destroy(&arc_evict_lock);
8226 list_destroy(&arc_evict_waiters);
8227
8228 /*
8229 * Free any buffers that were tagged for destruction. This needs
8230 * to occur before arc_state_fini() runs and destroys the aggsum
8231 * values which are updated when freeing scatter ABDs.
8232 * Pass NULL to free all ABDs regardless of device.
8233 */
8234 l2arc_do_free_on_write(NULL);
8235
8236 /*
8237 * buf_fini() must proceed arc_state_fini() because buf_fin() may
8238 * trigger the release of kmem magazines, which can callback to
8239 * arc_space_return() which accesses aggsums freed in act_state_fini().
8240 */
8241 buf_fini();
8242 arc_state_fini();
8243
8244 arc_unregister_hotplug();
8245
8246 /*
8247 * We destroy the zthrs after all the ARC state has been
8248 * torn down to avoid the case of them receiving any
8249 * wakeup() signals after they are destroyed.
8250 */
8251 zthr_destroy(arc_evict_zthr);
8252 zthr_destroy(arc_reap_zthr);
8253
8254 ASSERT0(arc_loaned_bytes);
8255 }
8256
8257 /*
8258 * Level 2 ARC
8259 *
8260 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
8261 * It uses dedicated storage devices to hold cached data, which are populated
8262 * using large infrequent writes. The main role of this cache is to boost
8263 * the performance of random read workloads. The intended L2ARC devices
8264 * include short-stroked disks, solid state disks, and other media with
8265 * substantially faster read latency than disk.
8266 *
8267 * +-----------------------+
8268 * | ARC |
8269 * +-----------------------+
8270 * | ^ ^
8271 * | | |
8272 * l2arc_feed_thread() arc_read()
8273 * | | |
8274 * | l2arc read |
8275 * V | |
8276 * +---------------+ |
8277 * | L2ARC | |
8278 * +---------------+ |
8279 * | ^ |
8280 * l2arc_write() | |
8281 * | | |
8282 * V | |
8283 * +-------+ +-------+
8284 * | vdev | | vdev |
8285 * | cache | | cache |
8286 * +-------+ +-------+
8287 * +=========+ .-----.
8288 * : L2ARC : |-_____-|
8289 * : devices : | Disks |
8290 * +=========+ `-_____-'
8291 *
8292 * Read requests are satisfied from the following sources, in order:
8293 *
8294 * 1) ARC
8295 * 2) vdev cache of L2ARC devices
8296 * 3) L2ARC devices
8297 * 4) vdev cache of disks
8298 * 5) disks
8299 *
8300 * Some L2ARC device types exhibit extremely slow write performance.
8301 * To accommodate for this there are some significant differences between
8302 * the L2ARC and traditional cache design:
8303 *
8304 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
8305 * the ARC behave as usual, freeing buffers and placing headers on ghost
8306 * lists. The ARC does not send buffers to the L2ARC during eviction as
8307 * this would add inflated write latencies for all ARC memory pressure.
8308 *
8309 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
8310 * It does this by periodically scanning buffers from the eviction-end of
8311 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
8312 * not already there. It scans until a headroom of buffers is satisfied,
8313 * which itself is a buffer for ARC eviction. If a compressible buffer is
8314 * found during scanning and selected for writing to an L2ARC device, we
8315 * temporarily boost scanning headroom during the next scan cycle to make
8316 * sure we adapt to compression effects (which might significantly reduce
8317 * the data volume we write to L2ARC). The thread that does this is
8318 * l2arc_feed_thread(), illustrated below; example sizes are included to
8319 * provide a better sense of ratio than this diagram:
8320 *
8321 * head --> tail
8322 * +---------------------+----------+
8323 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
8324 * +---------------------+----------+ | o L2ARC eligible
8325 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
8326 * +---------------------+----------+ |
8327 * 15.9 Gbytes ^ 32 Mbytes |
8328 * headroom |
8329 * l2arc_feed_thread()
8330 * |
8331 * l2arc write hand <--[oooo]--'
8332 * | 8 Mbyte
8333 * | write max
8334 * V
8335 * +==============================+
8336 * L2ARC dev |####|#|###|###| |####| ... |
8337 * +==============================+
8338 * 32 Gbytes
8339 *
8340 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
8341 * evicted, then the L2ARC has cached a buffer much sooner than it probably
8342 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
8343 * safe to say that this is an uncommon case, since buffers at the end of
8344 * the ARC lists have moved there due to inactivity.
8345 *
8346 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
8347 * then the L2ARC simply misses copying some buffers. This serves as a
8348 * pressure valve to prevent heavy read workloads from both stalling the ARC
8349 * with waits and clogging the L2ARC with writes. This also helps prevent
8350 * the potential for the L2ARC to churn if it attempts to cache content too
8351 * quickly, such as during backups of the entire pool.
8352 *
8353 * 5. After system boot and before the ARC has filled main memory, there are
8354 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
8355 * lists can remain mostly static. Instead of searching from tail of these
8356 * lists as pictured, the l2arc_feed_thread() will search from the list heads
8357 * for eligible buffers, greatly increasing its chance of finding them.
8358 *
8359 * The L2ARC device write speed is also boosted during this time so that
8360 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
8361 * there are no L2ARC reads, and no fear of degrading read performance
8362 * through increased writes.
8363 *
8364 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
8365 * the vdev queue can aggregate them into larger and fewer writes. Each
8366 * device is written to in a rotor fashion, sweeping writes through
8367 * available space then repeating.
8368 *
8369 * 7. The L2ARC does not store dirty content. It never needs to flush
8370 * write buffers back to disk based storage.
8371 *
8372 * 8. If an ARC buffer is written (and dirtied) which also exists in the
8373 * L2ARC, the now stale L2ARC buffer is immediately dropped.
8374 *
8375 * The performance of the L2ARC can be tweaked by a number of tunables, which
8376 * may be necessary for different workloads:
8377 *
8378 * l2arc_write_max max write bytes per interval
8379 * l2arc_dwpd_limit device write endurance limit (100 = 1.0 DWPD)
8380 * l2arc_noprefetch skip caching prefetched buffers
8381 * l2arc_headroom number of max device writes to precache
8382 * l2arc_headroom_boost when we find compressed buffers during ARC
8383 * scanning, we multiply headroom by this
8384 * percentage factor for the next scan cycle,
8385 * since more compressed buffers are likely to
8386 * be present
8387 * l2arc_feed_secs seconds between L2ARC writing
8388 *
8389 * Tunables may be removed or added as future performance improvements are
8390 * integrated, and also may become zpool properties.
8391 *
 * There are two key functions that control how the L2ARC warms up:
 *
 *	l2arc_write_eligible()	check if a buffer is eligible to cache
 *	l2arc_write_size()	calculate how much to write and the feed
 *				interval
 *
 * These two functions determine what to write, how much, and how quickly
 * to send writes.
8399 *
8400 * L2ARC persistence:
8401 *
8402 * When writing buffers to L2ARC, we periodically add some metadata to
8403 * make sure we can pick them up after reboot, thus dramatically reducing
8404 * the impact that any downtime has on the performance of storage systems
8405 * with large caches.
8406 *
8407 * The implementation works fairly simply by integrating the following two
8408 * modifications:
8409 *
8410 * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
8411 * which is an additional piece of metadata which describes what's been
8412 * written. This allows us to rebuild the arc_buf_hdr_t structures of the
8413 * main ARC buffers. There are 2 linked-lists of log blocks headed by
8414 * dh_start_lbps[2]. We alternate which chain we append to, so they are
8415 * time-wise and offset-wise interleaved, but that is an optimization rather
8416 * than for correctness. The log block also includes a pointer to the
8417 * previous block in its chain.
8418 *
8419 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
8420 * for our header bookkeeping purposes. This contains a device header,
8421 * which contains our top-level reference structures. We update it each
8422 * time we write a new log block, so that we're able to locate it in the
8423 * L2ARC device. If this write results in an inconsistent device header
8424 * (e.g. due to power failure), we detect this by verifying the header's
8425 * checksum and simply fail to reconstruct the L2ARC after reboot.
8426 *
8427 * Implementation diagram:
8428 *
8429 * +=== L2ARC device (not to scale) ======================================+
8430 * | ___two newest log block pointers__.__________ |
8431 * | / \dh_start_lbps[1] |
8432 * | / \ \dh_start_lbps[0]|
8433 * |.___/__. V V |
8434 * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
8435 * || hdr| ^ /^ /^ / / |
8436 * |+------+ ...--\-------/ \-----/--\------/ / |
8437 * | \--------------/ \--------------/ |
8438 * +======================================================================+
8439 *
8440 * As can be seen on the diagram, rather than using a simple linked list,
8441 * we use a pair of linked lists with alternating elements. This is a
8442 * performance enhancement due to the fact that we only find out the
8443 * address of the next log block access once the current block has been
 * completely read in. Obviously, this hurts performance, because we'd be
 * keeping the device's I/O queue only 1 operation deep, thus
 * incurring a large amount of I/O round-trip latency. Having two lists
8447 * allows us to fetch two log blocks ahead of where we are currently
8448 * rebuilding L2ARC buffers.
8449 *
8450 * On-device data structures:
8451 *
8452 * L2ARC device header: l2arc_dev_hdr_phys_t
8453 * L2ARC log block: l2arc_log_blk_phys_t
8454 *
8455 * L2ARC reconstruction:
8456 *
8457 * When writing data, we simply write in the standard rotary fashion,
8458 * evicting buffers as we go and simply writing new data over them (writing
8459 * a new log block every now and then). This obviously means that once we
8460 * loop around the end of the device, we will start cutting into an already
8461 * committed log block (and its referenced data buffers), like so:
8462 *
8463 * current write head__ __old tail
8464 * \ /
8465 * V V
8466 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
8467 * ^ ^^^^^^^^^___________________________________
8468 * | \
8469 * <<nextwrite>> may overwrite this blk and/or its bufs --'
8470 *
8471 * When importing the pool, we detect this situation and use it to stop
8472 * our scanning process (see l2arc_rebuild).
8473 *
8474 * There is one significant caveat to consider when rebuilding ARC contents
8475 * from an L2ARC device: what about invalidated buffers? Given the above
8476 * construction, we cannot update blocks which we've already written to amend
8477 * them to remove buffers which were invalidated. Thus, during reconstruction,
8478 * we might be populating the cache with buffers for data that's not on the
8479 * main pool anymore, or may have been overwritten!
8480 *
8481 * As it turns out, this isn't a problem. Every arc_read request includes
8482 * both the DVA and, crucially, the birth TXG of the BP the caller is
8483 * looking for. So even if the cache were populated by completely rotten
8484 * blocks for data that had been long deleted and/or overwritten, we'll
8485 * never actually return bad data from the cache, since the DVA with the
8486 * birth TXG uniquely identify a block in space and time - once created,
8487 * a block is immutable on disk. The worst thing we have done is wasted
8488 * some time and memory at l2arc rebuild to reconstruct outdated ARC
8489 * entries that will get dropped from the l2arc as it is being updated
8490 * with new blocks.
8491 *
8492 * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
8493 * hand are not restored. This is done by saving the offset (in bytes)
8494 * l2arc_evict() has evicted to in the L2ARC device header and taking it
8495 * into account when restoring buffers.
8496 */
8497
8498 static boolean_t
l2arc_write_eligible(uint64_t spa_guid,arc_buf_hdr_t * hdr)8499 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
8500 {
8501 /*
8502 * A buffer is *not* eligible for the L2ARC if it:
8503 * 1. belongs to a different spa.
8504 * 2. is already cached on the L2ARC.
8505 * 3. has an I/O in progress (it may be an incomplete read).
8506 * 4. is flagged not eligible (zfs property).
8507 */
8508 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
8509 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
8510 return (B_FALSE);
8511
8512 return (B_TRUE);
8513 }
8514
8515 static uint64_t
l2arc_write_size(l2arc_dev_t * dev,clock_t * interval)8516 l2arc_write_size(l2arc_dev_t *dev, clock_t *interval)
8517 {
8518 uint64_t size;
8519 uint64_t write_rate = l2arc_get_write_rate(dev);
8520
8521 if (write_rate > L2ARC_BURST_SIZE_MAX) {
8522 /* Calculate interval to achieve desired rate with burst cap */
8523 uint64_t feeds_per_sec =
8524 MAX(DIV_ROUND_UP(write_rate, L2ARC_BURST_SIZE_MAX), 1);
8525 *interval = hz / feeds_per_sec;
8526 size = write_rate / feeds_per_sec;
8527 } else {
8528 *interval = hz; /* 1 second default */
8529 size = write_rate;
8530 }
8531
8532 /* We need to add in the worst case scenario of log block overhead. */
8533 size += l2arc_log_blk_overhead(size, dev);
8534 if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
8535 /*
8536 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
8537 * times the writesize, whichever is greater.
8538 */
8539 size += MAX(64 * 1024 * 1024,
8540 (size * l2arc_trim_ahead) / 100);
8541 }
8542
8543 /*
8544 * Make sure the write size does not exceed the size of the cache
8545 * device. This is important in l2arc_evict(), otherwise infinite
8546 * iteration can occur.
8547 */
8548 size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
8549
8550 size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
8551
8552 return (size);
8553
8554 }
8555
8556 /*
8557 * Free buffers that were tagged for destruction.
8558 */
8559 static void
l2arc_do_free_on_write(l2arc_dev_t * dev)8560 l2arc_do_free_on_write(l2arc_dev_t *dev)
8561 {
8562 l2arc_data_free_t *df, *df_next;
8563 boolean_t all = (dev == NULL);
8564
8565 mutex_enter(&l2arc_free_on_write_mtx);
8566 df = list_head(l2arc_free_on_write);
8567 while (df != NULL) {
8568 df_next = list_next(l2arc_free_on_write, df);
8569 if (all || df->l2df_dev == dev) {
8570 list_remove(l2arc_free_on_write, df);
8571 ASSERT3P(df->l2df_abd, !=, NULL);
8572 abd_free(df->l2df_abd);
8573 kmem_free(df, sizeof (l2arc_data_free_t));
8574 }
8575 df = df_next;
8576 }
8577 mutex_exit(&l2arc_free_on_write_mtx);
8578 }
8579
8580 /*
8581 * A write to a cache device has completed. Update all headers to allow
8582 * reads from these buffers to begin.
8583 */
8584 static void
l2arc_write_done(zio_t * zio)8585 l2arc_write_done(zio_t *zio)
8586 {
8587 l2arc_write_callback_t *cb;
8588 l2arc_lb_abd_buf_t *abd_buf;
8589 l2arc_lb_ptr_buf_t *lb_ptr_buf;
8590 l2arc_dev_t *dev;
8591 l2arc_dev_hdr_phys_t *l2dhdr;
8592 list_t *buflist;
8593 arc_buf_hdr_t *head, *hdr, *hdr_prev;
8594 kmutex_t *hash_lock;
8595 int64_t bytes_dropped = 0;
8596
8597 cb = zio->io_private;
8598 ASSERT3P(cb, !=, NULL);
8599 dev = cb->l2wcb_dev;
8600 l2dhdr = dev->l2ad_dev_hdr;
8601 ASSERT3P(dev, !=, NULL);
8602 head = cb->l2wcb_head;
8603 ASSERT3P(head, !=, NULL);
8604 buflist = &dev->l2ad_buflist;
8605 ASSERT3P(buflist, !=, NULL);
8606 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
8607 l2arc_write_callback_t *, cb);
8608
8609 /*
8610 * All writes completed, or an error was hit.
8611 */
8612 top:
8613 mutex_enter(&dev->l2ad_mtx);
8614 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
8615 hdr_prev = list_prev(buflist, hdr);
8616
8617 hash_lock = HDR_LOCK(hdr);
8618
8619 /*
8620 * We cannot use mutex_enter or else we can deadlock
8621 * with l2arc_write_buffers (due to swapping the order
8622 * the hash lock and l2ad_mtx are taken).
8623 */
8624 if (!mutex_tryenter(hash_lock)) {
8625 /*
8626 * Missed the hash lock. We must retry so we
8627 * don't leave the ARC_FLAG_L2_WRITING bit set.
8628 */
8629 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
8630
8631 /*
8632 * We don't want to rescan the headers we've
8633 * already marked as having been written out, so
8634 * we reinsert the head node so we can pick up
8635 * where we left off.
8636 */
8637 list_remove(buflist, head);
8638 list_insert_after(buflist, hdr, head);
8639
8640 mutex_exit(&dev->l2ad_mtx);
8641
8642 /*
8643 * We wait for the hash lock to become available
8644 * to try and prevent busy waiting, and increase
8645 * the chance we'll be able to acquire the lock
8646 * the next time around.
8647 */
8648 mutex_enter(hash_lock);
8649 mutex_exit(hash_lock);
8650 goto top;
8651 }
8652
8653 /*
8654 * We could not have been moved into the arc_l2c_only
8655 * state while in-flight due to our ARC_FLAG_L2_WRITING
8656 * bit being set. Let's just ensure that's being enforced.
8657 */
8658 ASSERT(HDR_HAS_L1HDR(hdr));
8659
8660 /*
8661 * Skipped - drop L2ARC entry and mark the header as no
8662 * longer L2 eligibile.
8663 */
8664 if (zio->io_error != 0) {
8665 /*
8666 * Error - drop L2ARC entry.
8667 */
8668 list_remove(buflist, hdr);
8669 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
8670
8671 uint64_t psize = HDR_GET_PSIZE(hdr);
8672 l2arc_hdr_arcstats_decrement(hdr);
8673
8674 ASSERT(dev->l2ad_vdev != NULL);
8675
8676 bytes_dropped +=
8677 vdev_psize_to_asize(dev->l2ad_vdev, psize);
8678 (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
8679 arc_hdr_size(hdr), hdr);
8680 }
8681
8682 /*
8683 * Allow ARC to begin reads and ghost list evictions to
8684 * this L2ARC entry.
8685 */
8686 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
8687
8688 mutex_exit(hash_lock);
8689 }
8690
8691 /*
8692 * Free the allocated abd buffers for writing the log blocks.
8693 * If the zio failed reclaim the allocated space and remove the
8694 * pointers to these log blocks from the log block pointer list
8695 * of the L2ARC device.
8696 */
8697 while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
8698 abd_free(abd_buf->abd);
8699 zio_buf_free(abd_buf, sizeof (*abd_buf));
8700 if (zio->io_error != 0) {
8701 lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
8702 /*
8703 * L2BLK_GET_PSIZE returns aligned size for log
8704 * blocks.
8705 */
8706 uint64_t asize =
8707 L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
8708 bytes_dropped += asize;
8709 ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
8710 ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
8711 zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
8712 lb_ptr_buf);
8713 (void) zfs_refcount_remove(&dev->l2ad_lb_count,
8714 lb_ptr_buf);
8715 kmem_free(lb_ptr_buf->lb_ptr,
8716 sizeof (l2arc_log_blkptr_t));
8717 kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
8718 }
8719 }
8720 list_destroy(&cb->l2wcb_abd_list);
8721
8722 if (zio->io_error != 0) {
8723 ARCSTAT_BUMP(arcstat_l2_writes_error);
8724
8725 /*
8726 * Restore the lbps array in the header to its previous state.
8727 * If the list of log block pointers is empty, zero out the
8728 * log block pointers in the device header.
8729 */
8730 lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
8731 for (int i = 0; i < 2; i++) {
8732 if (lb_ptr_buf == NULL) {
8733 /*
8734 * If the list is empty zero out the device
8735 * header. Otherwise zero out the second log
8736 * block pointer in the header.
8737 */
8738 if (i == 0) {
8739 memset(l2dhdr, 0,
8740 dev->l2ad_dev_hdr_asize);
8741 } else {
8742 memset(&l2dhdr->dh_start_lbps[i], 0,
8743 sizeof (l2arc_log_blkptr_t));
8744 }
8745 break;
8746 }
8747 memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
8748 sizeof (l2arc_log_blkptr_t));
8749 lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
8750 lb_ptr_buf);
8751 }
8752 }
8753
8754 ARCSTAT_BUMP(arcstat_l2_writes_done);
8755 list_remove(buflist, head);
8756 ASSERT(!HDR_HAS_L1HDR(head));
8757 kmem_cache_free(hdr_l2only_cache, head);
8758 mutex_exit(&dev->l2ad_mtx);
8759
8760 ASSERT(dev->l2ad_vdev != NULL);
8761 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
8762
8763 l2arc_do_free_on_write(dev);
8764
8765 kmem_free(cb, sizeof (l2arc_write_callback_t));
8766 }
8767
8768 static int
l2arc_untransform(zio_t * zio,l2arc_read_callback_t * cb)8769 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
8770 {
8771 int ret;
8772 spa_t *spa = zio->io_spa;
8773 arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
8774 blkptr_t *bp = zio->io_bp;
8775 uint8_t salt[ZIO_DATA_SALT_LEN];
8776 uint8_t iv[ZIO_DATA_IV_LEN];
8777 uint8_t mac[ZIO_DATA_MAC_LEN];
8778 boolean_t no_crypt = B_FALSE;
8779
8780 /*
8781 * ZIL data is never be written to the L2ARC, so we don't need
8782 * special handling for its unique MAC storage.
8783 */
8784 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
8785 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
8786 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8787
8788 /*
8789 * If the data was encrypted, decrypt it now. Note that
8790 * we must check the bp here and not the hdr, since the
8791 * hdr does not have its encryption parameters updated
8792 * until arc_read_done().
8793 */
8794 if (BP_IS_ENCRYPTED(bp)) {
8795 abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
8796 ARC_HDR_USE_RESERVE);
8797
8798 zio_crypt_decode_params_bp(bp, salt, iv);
8799 zio_crypt_decode_mac_bp(bp, mac);
8800
8801 ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
8802 BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
8803 salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
8804 hdr->b_l1hdr.b_pabd, &no_crypt);
8805 if (ret != 0) {
8806 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
8807 goto error;
8808 }
8809
8810 /*
8811 * If we actually performed decryption, replace b_pabd
8812 * with the decrypted data. Otherwise we can just throw
8813 * our decryption buffer away.
8814 */
8815 if (!no_crypt) {
8816 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8817 arc_hdr_size(hdr), hdr);
8818 hdr->b_l1hdr.b_pabd = eabd;
8819 zio->io_abd = eabd;
8820 } else {
8821 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
8822 }
8823 }
8824
8825 /*
8826 * If the L2ARC block was compressed, but ARC compression
8827 * is disabled we decompress the data into a new buffer and
8828 * replace the existing data.
8829 */
8830 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8831 !HDR_COMPRESSION_ENABLED(hdr)) {
8832 abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
8833 ARC_HDR_USE_RESERVE);
8834
8835 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
8836 hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
8837 HDR_GET_LSIZE(hdr), &hdr->b_complevel);
8838 if (ret != 0) {
8839 arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
8840 goto error;
8841 }
8842
8843 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8844 arc_hdr_size(hdr), hdr);
8845 hdr->b_l1hdr.b_pabd = cabd;
8846 zio->io_abd = cabd;
8847 zio->io_size = HDR_GET_LSIZE(hdr);
8848 }
8849
8850 return (0);
8851
8852 error:
8853 return (ret);
8854 }
8855
8856
/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 *
 * zio->io_private carries the l2arc_read_callback_t describing the read;
 * it is freed here on every path.
 *
 * On success (no I/O error, checksum matches, untransform succeeded and the
 * header was not flagged L2-evicted meanwhile) io_private is switched to the
 * ARC header and the zio is passed to arc_read_done().  Otherwise io_error
 * is set and, when nobody is waiting on the zio, a replacement read of the
 * original block pointer is issued via zio_read().
 */
static void
l2arc_read_done(zio_t *zio)
{
	int tfm_error = 0;
	l2arc_read_callback_t *cb = zio->io_private;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t valid_cksum;
	/*
	 * True when the raw (still encrypted) data was requested for an
	 * encrypted block; the read then targets b_rabd instead of b_pabd.
	 */
	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));

	ASSERT3P(zio->io_vd, !=, NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	/*
	 * Release the SCL_L2ARC config lock hold tagged with the cache vdev
	 * (presumably taken when this read was issued -- not visible here).
	 */
	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	ASSERT3P(cb, !=, NULL);
	hdr = cb->l2rcb_hdr;
	ASSERT3P(hdr, !=, NULL);

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);
	/* The header must still map to the lock we just acquired. */
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the data was read into a temporary buffer,
	 * move it and free the buffer.
	 */
	if (cb->l2rcb_abd != NULL) {
		/* A temporary buffer implies the I/O was larger than the hdr */
		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
		if (zio->io_error == 0) {
			if (using_rdata) {
				abd_copy(hdr->b_crypt_hdr.b_rabd,
				    cb->l2rcb_abd, arc_hdr_size(hdr));
			} else {
				abd_copy(hdr->b_l1hdr.b_pabd,
				    cb->l2rcb_abd, arc_hdr_size(hdr));
			}
		}

		/*
		 * The following must be done regardless of whether
		 * there was an error:
		 * - free the temporary buffer
		 * - point zio to the real ARC buffer
		 * - set zio size accordingly
		 * These are required because the zio is either re-used for
		 * an I/O of the block in the case of an error, or it is
		 * passed to arc_read_done() and needs real data.
		 */
		abd_free(cb->l2rcb_abd);
		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);

		if (using_rdata) {
			ASSERT(HDR_HAS_RABD(hdr));
			zio->io_abd = zio->io_orig_abd =
			    hdr->b_crypt_hdr.b_rabd;
		} else {
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
		}
	}

	ASSERT3P(zio->io_abd, !=, NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
	/*
	 * Give the zio a private copy of the block pointer saved in the
	 * callback, and the header's compression level, before the checksum
	 * comparison below.
	 */
	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
	zio->io_prop.zp_complevel = hdr->b_complevel;

	valid_cksum = arc_cksum_is_equal(hdr, zio);

	/*
	 * b_rabd will always match the data as it exists on disk if it is
	 * being used. Therefore if we are reading into b_rabd we do not
	 * attempt to untransform the data.
	 */
	if (valid_cksum && !using_rdata)
		tfm_error = l2arc_untransform(zio, cb);

	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
	    !HDR_L2_EVICTED(hdr)) {
		/* Success: drop the hash lock before calling into the ARC. */
		mutex_exit(hash_lock);
		zio->io_private = hdr;
		arc_read_done(zio);
	} else {
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			/* No I/O error as such: report the failure as EIO. */
			zio->io_error = SET_ERROR(EIO);
		}
		if (!valid_cksum || tfm_error != 0)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);
			/* Read into the same buffer the L2ARC read targeted. */
			void *abd = (using_rdata) ?
			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio = zio_read(pio, zio->io_spa, zio->io_bp,
			    abd, zio->io_size, arc_read_done,
			    hdr, zio->io_priority, cb->l2rcb_flags,
			    &cb->l2rcb_zb);

			/*
			 * Original ZIO will be freed, so we need to update
			 * ARC header with the new ZIO pointer to be used
			 * by zio_change_priority() in arc_read().
			 */
			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
			    acb != NULL; acb = acb->acb_next)
				acb->acb_zio_head = zio;

			/* Issue the new read only after dropping the lock. */
			mutex_exit(hash_lock);
			zio_nowait(zio);
		} else {
			mutex_exit(hash_lock);
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}
8999
9000 /*
9001 * Get the multilist for the given list number (0..3) to cycle through
9002 * lists in the desired order. This order can have a significant effect
9003 * on cache performance.
9004 *
9005 * Currently the metadata lists are hit first, MFU then MRU, followed by
9006 * the data lists.
9007 */
9008 static multilist_t *
l2arc_get_list(int list_num)9009 l2arc_get_list(int list_num)
9010 {
9011 ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
9012
9013 switch (list_num) {
9014 case 0:
9015 return (&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
9016 case 1:
9017 return (&arc_mru->arcs_list[ARC_BUFC_METADATA]);
9018 case 2:
9019 return (&arc_mfu->arcs_list[ARC_BUFC_DATA]);
9020 case 3:
9021 return (&arc_mru->arcs_list[ARC_BUFC_DATA]);
9022 default:
9023 return (NULL);
9024 }
9025 }
9026
9027
9028 /*
9029 * Lock a specific sublist within the given list number.
9030 */
9031 static multilist_sublist_t *
l2arc_sublist_lock(int list_num,int sublist_idx)9032 l2arc_sublist_lock(int list_num, int sublist_idx)
9033 {
9034 multilist_t *ml = l2arc_get_list(list_num);
9035 if (ml == NULL)
9036 return (NULL);
9037
9038 return (multilist_sublist_lock_idx(ml, sublist_idx));
9039 }
9040
9041 /*
9042 * Check if a pool has any L2ARC devices.
9043 */
9044 static boolean_t
l2arc_pool_has_devices(spa_t * target_spa)9045 l2arc_pool_has_devices(spa_t *target_spa)
9046 {
9047 l2arc_dev_t *dev;
9048
9049 ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
9050
9051 for (dev = list_head(l2arc_dev_list); dev != NULL;
9052 dev = list_next(l2arc_dev_list, dev)) {
9053 if (dev->l2ad_spa == target_spa) {
9054 return (B_TRUE);
9055 }
9056 }
9057
9058 return (B_FALSE);
9059 }
9060
9061 /*
9062 * Initialize pool-based markers for l2arc position saving.
9063 */
9064 static void
l2arc_pool_markers_init(spa_t * spa)9065 l2arc_pool_markers_init(spa_t *spa)
9066 {
9067 mutex_init(&spa->spa_l2arc_info.l2arc_sublist_lock, NULL,
9068 MUTEX_DEFAULT, NULL);
9069
9070 for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
9071 multilist_t *ml = l2arc_get_list(pass);
9072 if (ml == NULL)
9073 continue;
9074
9075 int num_sublists = multilist_get_num_sublists(ml);
9076
9077 spa->spa_l2arc_info.l2arc_markers[pass] =
9078 arc_state_alloc_markers(num_sublists);
9079 spa->spa_l2arc_info.l2arc_sublist_busy[pass] =
9080 kmem_zalloc(num_sublists * sizeof (boolean_t), KM_SLEEP);
9081 spa->spa_l2arc_info.l2arc_sublist_reset[pass] =
9082 kmem_zalloc(num_sublists * sizeof (boolean_t), KM_SLEEP);
9083
9084 for (int i = 0; i < num_sublists; i++) {
9085 multilist_sublist_t *mls =
9086 multilist_sublist_lock_idx(ml, i);
9087 multilist_sublist_insert_tail(mls,
9088 spa->spa_l2arc_info.l2arc_markers[pass][i]);
9089 multilist_sublist_unlock(mls);
9090 }
9091
9092 spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
9093 }
9094 }
9095
9096 /*
9097 * Free all allocated pool-based markers.
9098 */
9099 static void
l2arc_pool_markers_fini(spa_t * spa)9100 l2arc_pool_markers_fini(spa_t *spa)
9101 {
9102 for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
9103 if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
9104 continue;
9105
9106 multilist_t *ml = l2arc_get_list(pass);
9107 if (ml == NULL)
9108 continue;
9109
9110 int num_sublists = multilist_get_num_sublists(ml);
9111
9112 for (int i = 0; i < num_sublists; i++) {
9113 ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
9114 !=, NULL);
9115 multilist_sublist_t *mls =
9116 multilist_sublist_lock_idx(ml, i);
9117 ASSERT(multilist_link_active(
9118 &spa->spa_l2arc_info.l2arc_markers[pass][i]->
9119 b_l1hdr.b_arc_node));
9120 multilist_sublist_remove(mls,
9121 spa->spa_l2arc_info.l2arc_markers[pass][i]);
9122 multilist_sublist_unlock(mls);
9123 }
9124
9125 arc_state_free_markers(spa->spa_l2arc_info.l2arc_markers[pass],
9126 num_sublists);
9127 spa->spa_l2arc_info.l2arc_markers[pass] = NULL;
9128
9129 /* Free sublist busy and reset flags for this pass */
9130 ASSERT3P(spa->spa_l2arc_info.l2arc_sublist_busy[pass], !=,
9131 NULL);
9132 kmem_free(spa->spa_l2arc_info.l2arc_sublist_busy[pass],
9133 num_sublists * sizeof (boolean_t));
9134 spa->spa_l2arc_info.l2arc_sublist_busy[pass] = NULL;
9135
9136 ASSERT3P(spa->spa_l2arc_info.l2arc_sublist_reset[pass], !=,
9137 NULL);
9138 kmem_free(spa->spa_l2arc_info.l2arc_sublist_reset[pass],
9139 num_sublists * sizeof (boolean_t));
9140 spa->spa_l2arc_info.l2arc_sublist_reset[pass] = NULL;
9141 }
9142
9143 mutex_destroy(&spa->spa_l2arc_info.l2arc_sublist_lock);
9144 }
9145
9146 /*
9147 * Calculates the maximum overhead of L2ARC metadata log blocks for a given
9148 * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
9149 * overhead in processing to make sure there is enough headroom available
9150 * when writing buffers.
9151 */
9152 static inline uint64_t
l2arc_log_blk_overhead(uint64_t write_sz,l2arc_dev_t * dev)9153 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
9154 {
9155 if (dev->l2ad_log_entries == 0) {
9156 return (0);
9157 } else {
9158 ASSERT(dev->l2ad_vdev != NULL);
9159
9160 uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
9161
9162 uint64_t log_blocks = (log_entries +
9163 dev->l2ad_log_entries - 1) /
9164 dev->l2ad_log_entries;
9165
9166 return (vdev_psize_to_asize(dev->l2ad_vdev,
9167 sizeof (l2arc_log_blk_phys_t)) * log_blocks);
9168 }
9169 }
9170
9171 /*
9172 * Bump the DWPD generation to trigger stats reset on all devices.
9173 */
9174 void
l2arc_dwpd_bump_reset(void)9175 l2arc_dwpd_bump_reset(void)
9176 {
9177 l2arc_dwpd_bump++;
9178 }
9179
9180 /*
9181 * Calculate DWPD rate limit for L2ARC device.
9182 */
9183 static uint64_t
l2arc_dwpd_rate_limit(l2arc_dev_t * dev)9184 l2arc_dwpd_rate_limit(l2arc_dev_t *dev)
9185 {
9186 uint64_t device_size = dev->l2ad_end - dev->l2ad_start;
9187 uint64_t daily_budget = (device_size * l2arc_dwpd_limit) / 100;
9188 uint64_t now = gethrestime_sec();
9189
9190 /* Reset stats on param change or daily period expiry */
9191 if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump ||
9192 (now - dev->l2ad_dwpd_start) >= 24 * 3600) {
9193 if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump) {
9194 /* Full reset on param change, no carryover */
9195 dev->l2ad_dwpd_accumulated = 0;
9196 dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
9197 } else {
9198 /* Save unused budget from last period (max 1 day) */
9199 if (dev->l2ad_dwpd_writes >= daily_budget)
9200 dev->l2ad_dwpd_accumulated = 0;
9201 else
9202 dev->l2ad_dwpd_accumulated =
9203 daily_budget - dev->l2ad_dwpd_writes;
9204 }
9205 dev->l2ad_dwpd_writes = 0;
9206 dev->l2ad_dwpd_start = now;
9207 }
9208
9209 uint64_t elapsed = now - dev->l2ad_dwpd_start;
9210 uint64_t remaining_secs = MAX((24 * 3600) - elapsed, 1);
9211 /* Add burst allowance for the first write after device wrap */
9212 uint64_t total_budget = daily_budget + dev->l2ad_dwpd_accumulated +
9213 L2ARC_BURST_SIZE_MAX;
9214
9215 if (dev->l2ad_dwpd_writes >= total_budget)
9216 return (0);
9217
9218 return ((total_budget - dev->l2ad_dwpd_writes) / remaining_secs);
9219 }
9220
9221 /*
9222 * Get write rate based on device state and DWPD configuration.
9223 */
9224 static uint64_t
l2arc_get_write_rate(l2arc_dev_t * dev)9225 l2arc_get_write_rate(l2arc_dev_t *dev)
9226 {
9227 uint64_t write_max = l2arc_write_max;
9228 spa_t *spa = dev->l2ad_spa;
9229
9230 /*
9231 * Make sure l2arc_write_max is valid in case user altered it.
9232 */
9233 if (write_max == 0) {
9234 cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
9235 "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
9236 write_max = l2arc_write_max = L2ARC_WRITE_SIZE;
9237 }
9238
9239 /* Apply DWPD rate limit for persistent marker configurations */
9240 if (!dev->l2ad_first && l2arc_dwpd_limit > 0 &&
9241 spa->spa_l2arc_info.l2arc_total_capacity >=
9242 L2ARC_PERSIST_THRESHOLD) {
9243 uint64_t dwpd_rate = l2arc_dwpd_rate_limit(dev);
9244 return (MIN(dwpd_rate, write_max));
9245 }
9246
9247 return (write_max);
9248 }
9249
/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 *
 *	dev	 - cache device to evict from
 *	distance - number of bytes ahead of dev->l2ad_hand to clear
 *	all	 - evict everything; only in this case may dev->l2ad_vdev
 *		   and dev->l2ad_spa be NULL
 *
 * When 'all' is not set and trimming is performed, the SCL_L2ARC config
 * lock (tag 'dev') is dropped and re-entered as reader around the trim,
 * which implies the caller holds it on entry.
 *
 * If hand + distance would run past the end of the device, the region up to
 * the end is evicted first, then both hands are moved to the start of the
 * device, the DWPD counters are reset and the function runs once more.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;
	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
	vdev_t *vd = dev->l2ad_vdev;
	boolean_t rerun;

	ASSERT(vd != NULL || all);
	ASSERT(dev->l2ad_spa != NULL || all);

	buflist = &dev->l2ad_buflist;

top:
	rerun = B_FALSE;
	if (dev->l2ad_hand + distance > dev->l2ad_end) {
		/*
		 * When there is no space to accommodate upcoming writes,
		 * evict to the end. Then bump the write and evict hands
		 * to the start and iterate. This iteration does not
		 * happen indefinitely as we make sure in
		 * l2arc_write_size() that when the write hand is reset,
		 * the write size does not exceed the end of the device.
		 */
		rerun = B_TRUE;
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

	if (!all) {
		/*
		 * This check has to be placed after deciding whether to
		 * iterate (rerun).
		 */
		if (dev->l2ad_first) {
			/*
			 * This is the first sweep through the device. There is
			 * nothing to evict. We have already trimmed the
			 * whole device.
			 */
			goto out;
		} else {
			/*
			 * Trim the space to be evicted.
			 */
			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
			    l2arc_trim_ahead > 0) {
				/*
				 * We have to drop the spa_config lock because
				 * vdev_trim_range() will acquire it.
				 * l2ad_evict already accounts for the label
				 * size. To prevent vdev_trim_ranges() from
				 * adding it again, we subtract it from
				 * l2ad_evict.
				 */
				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
				vdev_trim_simple(vd,
				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
				    taddr - dev->l2ad_evict);
				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
				    RW_READER);
			}

			/*
			 * When rebuilding L2ARC we retrieve the evict hand
			 * from the header of the device. Of note, l2arc_evict()
			 * does not actually delete buffers from the cache
			 * device, but trimming may do so depending on the
			 * hardware implementation. Thus keeping track of the
			 * evict hand is useful.
			 */
			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
		}
	}

retry:
	mutex_enter(&dev->l2ad_mtx);
	/*
	 * We have to account for evicted log blocks. Run vdev_space_update()
	 * on log blocks whose offset (in bytes) is before the evicted offset
	 * (in bytes) by searching in the list of pointers to log blocks
	 * present in the L2ARC device.
	 */
	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
	    lb_ptr_buf = lb_ptr_buf_prev) {

		/* Fetch the predecessor first: this entry may be freed. */
		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		uint64_t asize = L2BLK_GET_PSIZE(
		    (lb_ptr_buf->lb_ptr)->lbp_prop);

		/*
		 * We don't worry about log blocks left behind (ie
		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
		 * will never write more than l2arc_evict() evicts.
		 */
		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
			/* First still-valid log block: stop scanning. */
			break;
		} else {
			/* Undo the space and stat accounting, then free it. */
			if (vd != NULL)
				vdev_space_update(vd, -asize, 0, 0);
			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
			    lb_ptr_buf);
			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
			    lb_ptr_buf);
			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
			kmem_free(lb_ptr_buf->lb_ptr,
			    sizeof (l2arc_log_blkptr_t));
			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
		}
	}

	/* Walk the device's buffer list from the tail towards the head. */
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		ASSERT(!HDR_EMPTY(hdr));
		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Drop l2ad_mtx, wait for the
			 * hash lock to become available (acquire and release
			 * it), then restart the scan from 'retry'.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto retry;
		}

		/*
		 * A header can't be on this list if it doesn't have L2 header.
		 */
		ASSERT(HDR_HAS_L2HDR(hdr));

		/* Ensure this header has finished being written. */
		ASSERT(!HDR_L2_WRITING(hdr));
		ASSERT(!HDR_L2_WRITE_HEAD(hdr));

		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC. Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_lsize.
			 */
			arc_change_state(arc_anon, hdr);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
			}

			/* Still cached in L1: only drop the L2 part. */
			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);

out:
	/*
	 * We need to check if we evict all buffers, otherwise we may iterate
	 * unnecessarily.
	 */
	if (!all && rerun) {
		/*
		 * Bump device hand to the device start if it is approaching the
		 * end. l2arc_evict() has already evicted ahead for this case.
		 */
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_evict = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
		/*
		 * Reset DWPD counters - first pass writes are free, start
		 * fresh 24h budget period now that device is full.
		 */
		dev->l2ad_dwpd_writes = 0;
		dev->l2ad_dwpd_start = gethrestime_sec();
		dev->l2ad_dwpd_accumulated = 0;
		dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
		goto top;
	}

	if (!all) {
		/*
		 * In case of cache device removal (all) the following
		 * assertions may be violated without functional consequences
		 * as the device is about to be removed.
		 */
		ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
		if (!dev->l2ad_first)
			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
	}
}
9481
9482 /*
9483 * Handle any abd transforms that might be required for writing to the L2ARC.
9484 * If successful, this function will always return an abd with the data
9485 * transformed as it is on disk in a new abd of asize bytes.
9486 */
9487 static int
l2arc_apply_transforms(spa_t * spa,arc_buf_hdr_t * hdr,uint64_t asize,abd_t ** abd_out)9488 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
9489 abd_t **abd_out)
9490 {
9491 int ret;
9492 abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
9493 enum zio_compress compress = HDR_GET_COMPRESS(hdr);
9494 uint64_t psize = HDR_GET_PSIZE(hdr);
9495 uint64_t size = arc_hdr_size(hdr);
9496 boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
9497 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
9498 dsl_crypto_key_t *dck = NULL;
9499 uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
9500 boolean_t no_crypt = B_FALSE;
9501
9502 ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
9503 !HDR_COMPRESSION_ENABLED(hdr)) ||
9504 HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
9505 ASSERT3U(psize, <=, asize);
9506
9507 /*
9508 * If this data simply needs its own buffer, we simply allocate it
9509 * and copy the data. This may be done to eliminate a dependency on a
9510 * shared buffer or to reallocate the buffer to match asize.
9511 */
9512 if (HDR_HAS_RABD(hdr)) {
9513 ASSERT3U(asize, >, psize);
9514 to_write = abd_alloc_for_io(asize, ismd);
9515 abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
9516 abd_zero_off(to_write, psize, asize - psize);
9517 goto out;
9518 }
9519
9520 if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
9521 !HDR_ENCRYPTED(hdr)) {
9522 ASSERT3U(size, ==, psize);
9523 to_write = abd_alloc_for_io(asize, ismd);
9524 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
9525 if (asize > size)
9526 abd_zero_off(to_write, size, asize - size);
9527 goto out;
9528 }
9529
9530 if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
9531 cabd = abd_alloc_for_io(MAX(size, asize), ismd);
9532 uint64_t csize = zio_compress_data(compress, to_write, &cabd,
9533 size, MIN(size, psize), hdr->b_complevel);
9534 if (csize >= size || csize > psize) {
9535 /*
9536 * We can't re-compress the block into the original
9537 * psize. Even if it fits into asize, it does not
9538 * matter, since checksum will never match on read.
9539 */
9540 abd_free(cabd);
9541 return (SET_ERROR(EIO));
9542 }
9543 if (asize > csize)
9544 abd_zero_off(cabd, csize, asize - csize);
9545 to_write = cabd;
9546 }
9547
9548 if (HDR_ENCRYPTED(hdr)) {
9549 eabd = abd_alloc_for_io(asize, ismd);
9550
9551 /*
9552 * If the dataset was disowned before the buffer
9553 * made it to this point, the key to re-encrypt
9554 * it won't be available. In this case we simply
9555 * won't write the buffer to the L2ARC.
9556 */
9557 ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
9558 FTAG, &dck);
9559 if (ret != 0)
9560 goto error;
9561
9562 ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
9563 hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
9564 hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
9565 &no_crypt);
9566 if (ret != 0)
9567 goto error;
9568
9569 if (no_crypt)
9570 abd_copy(eabd, to_write, psize);
9571
9572 if (psize != asize)
9573 abd_zero_off(eabd, psize, asize - psize);
9574
9575 /* assert that the MAC we got here matches the one we saved */
9576 ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
9577 spa_keystore_dsl_key_rele(spa, dck, FTAG);
9578
9579 if (to_write == cabd)
9580 abd_free(cabd);
9581
9582 to_write = eabd;
9583 }
9584
9585 out:
9586 ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
9587 *abd_out = to_write;
9588 return (0);
9589
9590 error:
9591 if (dck != NULL)
9592 spa_keystore_dsl_key_rele(spa, dck, FTAG);
9593 if (cabd != NULL)
9594 abd_free(cabd);
9595 if (eabd != NULL)
9596 abd_free(eabd);
9597
9598 *abd_out = NULL;
9599 return (ret);
9600 }
9601
/*
 * Write buffers from a single sublist to L2ARC.
 * Handles locking, marker determination, and buffer processing.
 * Returns B_TRUE if target size reached, B_FALSE otherwise (including the
 * case where the saved position is already at the head of the sublist and
 * nothing is scanned).
 *
 *	spa		 - pool being fed; supplies the load guid used for
 *			   the eligibility check and the per-pool markers
 *	dev		 - cache device written to; its write hand advances
 *	pass		 - feed list number (see l2arc_get_list())
 *	sublist_idx	 - sublist of that list to scan
 *	target_sz	 - byte budget for this run
 *	write_asize	 - in/out: allocated bytes written so far
 *	write_psize	 - in/out: physical bytes written so far
 *	pio, cb		 - in/out: root zio and write callback; both are
 *			   created here when *pio is still NULL at the first
 *			   buffer written
 *	head		 - dummy header inserted on the device buflist in
 *			   front of the first buffer written
 *	consumed	 - in/out: logical bytes examined so far, compared
 *			   against sublist_headroom
 *	sublist_headroom - scan limit, enforced only if l2arc_headroom != 0
 *	save_position	 - resume from, and afterwards update, the pool's
 *			   persistent marker for this sublist
 */
static boolean_t
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, int pass, int sublist_idx,
    uint64_t target_sz, uint64_t *write_asize, uint64_t *write_psize,
    zio_t **pio, l2arc_write_callback_t **cb, arc_buf_hdr_t *head,
    uint64_t *consumed, uint64_t sublist_headroom, boolean_t save_position)
{
	multilist_sublist_t *mls;
	arc_buf_hdr_t *hdr;
	arc_buf_hdr_t *persistent_marker, *local_marker;
	boolean_t full = B_FALSE;
	boolean_t scan_from_head = B_FALSE;
	uint64_t guid = spa_load_guid(spa);

	mls = l2arc_sublist_lock(pass, sublist_idx);
	ASSERT3P(mls, !=, NULL);

	persistent_marker = spa->spa_l2arc_info.
	    l2arc_markers[pass][sublist_idx];

	/*
	 * Check if this sublist's marker was flagged for reset to tail.
	 * This handles depth cap resets and global resets without needing
	 * to coordinate with actively-scanning threads.
	 */
	if (save_position &&
	    spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx]) {
		multilist_sublist_remove(mls, persistent_marker);
		multilist_sublist_insert_tail(mls, persistent_marker);
		spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx] =
		    B_FALSE;
	}

	/*
	 * A marker sitting at the head means there is nothing in front of
	 * it left to scan.
	 */
	if (save_position && persistent_marker == multilist_sublist_head(mls)) {
		multilist_sublist_unlock(mls);
		return (B_FALSE);
	}

	local_marker = arc_state_alloc_marker();

	if (save_position) {
		/* Resume with the element just in front of the marker. */
		hdr = multilist_sublist_prev(mls, persistent_marker);
		ASSERT3P(hdr, !=, NULL);
		scan_from_head = B_FALSE;
	} else {
		/*
		 * Without a saved position the scan direction depends on
		 * arc_warm: tail to head once it is set, head to tail
		 * otherwise.
		 */
		if (arc_warm) {
			hdr = multilist_sublist_tail(mls);
			scan_from_head = B_FALSE;
		} else {
			hdr = multilist_sublist_head(mls);
			scan_from_head = B_TRUE;
		}
		ASSERT3P(hdr, !=, NULL);
	}

	while (hdr != NULL) {
		kmutex_t *hash_lock;
		abd_t *to_write = NULL;

		hash_lock = HDR_LOCK(hdr);
		if (!mutex_tryenter(hash_lock)) {
skip:
			/*
			 * Skip this buffer rather than waiting.  Also the
			 * target of 'goto skip' for ineligible buffers; the
			 * sublist lock is still held on both paths.
			 */
			if (scan_from_head)
				hdr = multilist_sublist_next(mls, hdr);
			else
				hdr = multilist_sublist_prev(mls, hdr);
			continue;
		}

		if (l2arc_headroom != 0 &&
		    *consumed + HDR_GET_LSIZE(hdr) >
		    MAX(sublist_headroom, HDR_GET_LSIZE(hdr))) {
			/*
			 * Searched too far in this sublist.
			 */
			mutex_exit(hash_lock);
			break;
		}

		*consumed += HDR_GET_LSIZE(hdr);

		if (!l2arc_write_eligible(guid, hdr)) {
			mutex_exit(hash_lock);
			goto skip;
		}

		ASSERT(HDR_HAS_L1HDR(hdr));
		ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
		ASSERT3U(arc_hdr_size(hdr), >, 0);
		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
		uint64_t psize = HDR_GET_PSIZE(hdr);
		uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);

		/*
		 * If the allocated size of this buffer plus the max
		 * size for the pending log block exceeds the evicted
		 * target size, terminate writing buffers for this run.
		 */
		if (*write_asize + asize +
		    sizeof (l2arc_log_blk_phys_t) > target_sz) {
			full = B_TRUE;
			mutex_exit(hash_lock);
			break;
		}

		/*
		 * We should not sleep with sublist lock held or it
		 * may block ARC eviction.  Insert a marker to save
		 * the position and drop the lock.
		 */
		if (scan_from_head)
			multilist_sublist_insert_after(mls, hdr, local_marker);
		else
			multilist_sublist_insert_before(mls, hdr, local_marker);
		multilist_sublist_unlock(mls);

		/*
		 * If this header has b_rabd, we can use this since it
		 * must always match the data exactly as it exists on
		 * disk.  Otherwise, the L2ARC can normally use the
		 * hdr's data, but if we're sharing data between the
		 * hdr and one of its bufs, L2ARC needs its own copy of
		 * the data so that the ZIO below can't race with the
		 * buf consumer.  To ensure that this copy will be
		 * available for the lifetime of the ZIO and be cleaned
		 * up afterwards, we add it to the l2arc_free_on_write
		 * queue.  If we need to apply any transforms to the
		 * data (compression, encryption) we will also need the
		 * extra buffer.
		 */
		if (HDR_HAS_RABD(hdr) && psize == asize) {
			to_write = hdr->b_crypt_hdr.b_rabd;
		} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
		    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
		    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
		    psize == asize) {
			to_write = hdr->b_l1hdr.b_pabd;
		} else {
			int ret = l2arc_apply_transforms(spa, hdr, asize,
			    &to_write);
			if (ret != 0) {
				/*
				 * The buffer could not be prepared: clear
				 * its L2CACHE flag and move on.
				 */
				arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
				mutex_exit(hash_lock);
				goto next;
			}

			l2arc_free_abd_on_write(to_write, dev);
		}

		/* Fill in the L2 part of the header for the new location. */
		hdr->b_l2hdr.b_dev = dev;
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
		hdr->b_l2hdr.b_hits = 0;
		hdr->b_l2hdr.b_arcs_state =
		    hdr->b_l1hdr.b_state->arcs_state;
		/* l2arc_hdr_arcstats_update() expects a valid asize */
		HDR_SET_L2SIZE(hdr, asize);
		arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
		    ARC_FLAG_L2_WRITING);

		(void) zfs_refcount_add_many(&dev->l2ad_alloc,
		    arc_hdr_size(hdr), hdr);
		l2arc_hdr_arcstats_increment(hdr);
		vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

		mutex_enter(&dev->l2ad_mtx);
		if (*pio == NULL) {
			/*
			 * Insert a dummy header on the buflist so
			 * l2arc_write_done() can find where the
			 * write buffers begin without searching.
			 */
			list_insert_head(&dev->l2ad_buflist, head);
		}
		list_insert_head(&dev->l2ad_buflist, hdr);
		mutex_exit(&dev->l2ad_mtx);

		/* 'commit' records whether a log block must be written out. */
		boolean_t commit = l2arc_log_blk_insert(dev, hdr);
		mutex_exit(hash_lock);

		if (*pio == NULL) {
			/* First buffer of this run: set up callback and root zio. */
			*cb = kmem_alloc(sizeof (l2arc_write_callback_t),
			    KM_SLEEP);
			(*cb)->l2wcb_dev = dev;
			(*cb)->l2wcb_head = head;
			list_create(&(*cb)->l2wcb_abd_list,
			    sizeof (l2arc_lb_abd_buf_t),
			    offsetof(l2arc_lb_abd_buf_t, node));
			*pio = zio_root(spa, l2arc_write_done, *cb,
			    ZIO_FLAG_CANFAIL);
		}

		zio_t *wzio = zio_write_phys(*pio, dev->l2ad_vdev,
		    dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF,
		    NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_CANFAIL, B_FALSE);

		DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
		    zio_t *, wzio);
		zio_nowait(wzio);

		*write_psize += psize;
		*write_asize += asize;
		dev->l2ad_hand += asize;

		if (commit) {
			/* l2ad_hand will be adjusted inside. */
			*write_asize += l2arc_log_blk_commit(dev, *pio, *cb);
		}

next:
		/*
		 * Retake the sublist lock and continue from the local
		 * marker, which is then taken off the list again.
		 */
		multilist_sublist_lock(mls);
		if (scan_from_head)
			hdr = multilist_sublist_next(mls, local_marker);
		else
			hdr = multilist_sublist_prev(mls, local_marker);
		multilist_sublist_remove(mls, local_marker);
	}

	/* Reposition persistent marker for next iteration. */
	multilist_sublist_remove(mls, persistent_marker);
	if (save_position &&
	    spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx]) {
		/* Reset flagged during scan, restart from tail. */
		multilist_sublist_insert_tail(mls, persistent_marker);
		spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx] =
		    B_FALSE;
	} else if (save_position && hdr != NULL) {
		/*
		 * Write budget or sublist headroom exhausted, position
		 * marker after hdr to retry it next time.
		 */
		multilist_sublist_insert_after(mls, hdr, persistent_marker);
	} else if (save_position) {
		/* End of sublist, position marker at head. */
		multilist_sublist_insert_head(mls, persistent_marker);
	} else {
		/* Non-persistent, reset marker to tail. */
		multilist_sublist_insert_tail(mls, persistent_marker);
	}

	multilist_sublist_unlock(mls);

	arc_state_free_marker(local_marker);

	return (full);
}
9853
9854 static void
l2arc_blk_fetch_done(zio_t * zio)9855 l2arc_blk_fetch_done(zio_t *zio)
9856 {
9857 l2arc_read_callback_t *cb;
9858
9859 cb = zio->io_private;
9860 if (cb->l2rcb_abd != NULL)
9861 abd_free(cb->l2rcb_abd);
9862 kmem_free(cb, sizeof (l2arc_read_callback_t));
9863 }
9864
9865 /*
9866 * Return the total size of the ARC state corresponding to the given
9867 * L2ARC pass number (0..3).
9868 */
9869 static uint64_t
l2arc_get_state_size(int pass)9870 l2arc_get_state_size(int pass)
9871 {
9872 switch (pass) {
9873 case L2ARC_MFU_META:
9874 return (zfs_refcount_count(
9875 &arc_mfu->arcs_size[ARC_BUFC_METADATA]));
9876 case L2ARC_MRU_META:
9877 return (zfs_refcount_count(
9878 &arc_mru->arcs_size[ARC_BUFC_METADATA]));
9879 case L2ARC_MFU_DATA:
9880 return (zfs_refcount_count(
9881 &arc_mfu->arcs_size[ARC_BUFC_DATA]));
9882 case L2ARC_MRU_DATA:
9883 return (zfs_refcount_count(
9884 &arc_mru->arcs_size[ARC_BUFC_DATA]));
9885 default:
9886 return (0);
9887 }
9888 }
9889
9890 /*
9891 * Flag all sublists for a single pass for lazy marker reset to tail.
9892 * Each sublist's marker will be reset when next visited by a feed thread.
9893 */
9894 static void
l2arc_flag_pass_reset(spa_t * spa,int pass)9895 l2arc_flag_pass_reset(spa_t *spa, int pass)
9896 {
9897 ASSERT(MUTEX_HELD(&spa->spa_l2arc_info.l2arc_sublist_lock));
9898
9899 multilist_t *ml = l2arc_get_list(pass);
9900 int num_sublists = multilist_get_num_sublists(ml);
9901
9902 for (int i = 0; i < num_sublists; i++) {
9903 multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
9904 spa->spa_l2arc_info.l2arc_sublist_reset[pass][i] = B_TRUE;
9905 multilist_sublist_unlock(mls);
9906 }
9907
9908 spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
9909 }
9910
9911 /*
9912 * Flag all L2ARC markers for lazy reset to tail for the given spa.
9913 * Each sublist's marker will be reset when next visited by a feed thread.
9914 */
9915 static void
l2arc_reset_all_markers(spa_t * spa)9916 l2arc_reset_all_markers(spa_t *spa)
9917 {
9918 for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++)
9919 l2arc_flag_pass_reset(spa, pass);
9920
9921 /* Reset write counter */
9922 spa->spa_l2arc_info.l2arc_total_writes = 0;
9923 }
9924
/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 *
 *	spa		pool whose shared feed state (spa_l2arc_info) is used
 *	dev		L2ARC device being fed
 *	target_sz	write budget for this call; the scan headroom is
 *			derived from it
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment and the
 * writing of log blocks). Returns 0 if no buffer was selected for writing.
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
	arc_buf_hdr_t *head;
	uint64_t write_asize, write_psize, headroom;
	boolean_t full;
	l2arc_write_callback_t *cb = NULL;
	zio_t *pio;
	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;

	ASSERT3P(dev->l2ad_vdev, !=, NULL);

	pio = NULL;
	write_asize = write_psize = 0;
	full = B_FALSE;
	/*
	 * Dummy header handed to l2arc_write_sublist(); it is only freed
	 * here if no write zio gets created (pio stays NULL).
	 */
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);

	/*
	 * Determine L2ARC implementation based on total pool L2ARC capacity
	 * vs ARC size. Use persistent markers for pools with significant
	 * L2ARC investment, otherwise use simple HEAD/TAIL scanning.
	 */
	boolean_t save_position =
	    (spa->spa_l2arc_info.l2arc_total_capacity >=
	    L2ARC_PERSIST_THRESHOLD);

	/*
	 * Check if markers need reset based on smallest device threshold.
	 * Reset when cumulative writes exceed 1/8th of smallest device.
	 * Must be protected since multiple device threads may check/update.
	 */
	mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
	if (save_position && spa->spa_l2arc_info.l2arc_total_writes >=
	    spa->spa_l2arc_info.l2arc_smallest_capacity / 8) {
		l2arc_reset_all_markers(spa);
	}
	mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);

	/*
	 * Copy buffers for L2ARC writing.
	 *
	 * Once this device has counted l2arc_meta_cycles calls in which a
	 * metadata pass filled the write budget, skip both metadata passes
	 * for this one call and restart the count.
	 */
	boolean_t skip_meta = (save_position &&
	    l2arc_meta_cycles > 0 &&
	    dev->l2ad_meta_cycles >= l2arc_meta_cycles);
	if (skip_meta)
		dev->l2ad_meta_cycles = 0;

	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
		/*
		 * pass == 0: MFU meta
		 * pass == 1: MRU meta
		 * pass == 2: MFU data
		 * pass == 3: MRU data
		 */
		/*
		 * l2arc_mfuonly == 1 skips both MRU passes; a larger value
		 * skips only the MRU data pass.
		 */
		if (l2arc_mfuonly == 1) {
			if (pass == 1 || pass == 3)
				continue;
		} else if (l2arc_mfuonly > 1) {
			if (pass == 3)
				continue;
		}

		if (skip_meta && pass <= L2ARC_MRU_META)
			continue;

		headroom = target_sz * l2arc_headroom;
		if (zfs_compressed_arc_enabled)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		multilist_t *ml = l2arc_get_list(pass);
		ASSERT3P(ml, !=, NULL);
		int num_sublists = multilist_get_num_sublists(ml);
		uint64_t consumed_headroom = 0;

		/*
		 * Equal per-sublist headroom prevents later
		 * sublists from getting disproportionate shares
		 * that would defeat the depth cap.
		 */
		uint64_t sublist_headroom = headroom / num_sublists;

		/*
		 * NOTE(review): l2arc_next_sublist[pass] is read here and
		 * updated after the loop without l2arc_sublist_lock held -
		 * confirm that is intentional.
		 */
		int current_sublist = spa->spa_l2arc_info.
		    l2arc_next_sublist[pass];
		int processed_sublists = 0;
		while (processed_sublists < num_sublists && !full) {
			if (consumed_headroom >= headroom)
				break;

			/*
			 * Check if sublist is busy (being processed by another
			 * L2ARC device thread). If so, skip to next sublist.
			 */
			mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
			if (spa->spa_l2arc_info.l2arc_sublist_busy[pass]
			    [current_sublist]) {
				mutex_exit(&spa->spa_l2arc_info.
				    l2arc_sublist_lock);
				current_sublist = (current_sublist + 1) %
				    num_sublists;
				processed_sublists++;
				continue;
			}
			/* Mark sublist as busy */
			spa->spa_l2arc_info.l2arc_sublist_busy[pass]
			    [current_sublist] = B_TRUE;
			mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);

			/*
			 * Write buffers from this sublist to L2ARC.
			 * Function handles locking, marker management, and
			 * buffer processing internally.
			 */
			full = l2arc_write_sublist(spa, dev, pass,
			    current_sublist, target_sz, &write_asize,
			    &write_psize, &pio, &cb, head,
			    &consumed_headroom, sublist_headroom,
			    save_position);

			/* Clear busy flag for this sublist */
			mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
			spa->spa_l2arc_info.l2arc_sublist_busy[pass]
			    [current_sublist] = B_FALSE;
			mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);

			current_sublist = (current_sublist + 1) % num_sublists;
			processed_sublists++;
		}

		/*
		 * The starting sublist for the next call advances by exactly
		 * one, however many sublists were visited above.
		 */
		spa->spa_l2arc_info.l2arc_next_sublist[pass] =
		    (spa->spa_l2arc_info.l2arc_next_sublist[pass] + 1) %
		    num_sublists;

		/*
		 * Count consecutive metadata monopolization toward
		 * l2arc_meta_cycles. Only count when metadata actually
		 * filled the write budget, starving data passes.
		 */
		if (save_position && pass <= L2ARC_MRU_META && full)
			dev->l2ad_meta_cycles++;

		/*
		 * Depth cap: track cumulative bytes scanned per pass
		 * and reset markers when the scan cap is reached.
		 * Keeps the marker near the tail where L2ARC adds
		 * the most value.
		 */
		if (save_position) {
			mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);

			spa->spa_l2arc_info.l2arc_ext_scanned[pass] +=
			    consumed_headroom;

			uint64_t state_sz = l2arc_get_state_size(pass);
			uint64_t scan_cap =
			    state_sz * l2arc_ext_headroom_pct / 100;

			/* A scan_cap of 0 disables the depth cap. */
			if (scan_cap > 0 &&
			    spa->spa_l2arc_info.l2arc_ext_scanned[pass] >=
			    scan_cap) {
				l2arc_flag_pass_reset(spa, pass);
			}

			mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
		}

		if (full == B_TRUE)
			break;
	}

	/*
	 * If nothing was written at all, reset monopolization counter.
	 * No point skipping metadata if data has nothing either.
	 */
	if (write_asize == 0)
		dev->l2ad_meta_cycles = 0;

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_psize);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);

		/*
		 * Although we did not write any buffers l2ad_evict may
		 * have advanced.
		 */
		if (dev->l2ad_evict != l2dhdr->dh_evict)
			l2arc_dev_hdr_update(dev);

		return (0);
	}

	if (!dev->l2ad_first)
		ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);

	/* l2ad_writing is set only for the duration of the zio_wait(). */
	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	/*
	 * Update cumulative write tracking for marker reset logic.
	 * Protected for multi-device thread access.
	 */
	mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
	spa->spa_l2arc_info.l2arc_total_writes += write_asize;
	mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);

	/* Track writes for DWPD rate limiting */
	dev->l2ad_dwpd_writes += write_asize;

	/*
	 * Update the device header after the zio completes as
	 * l2arc_write_done() may have updated the memory holding the log block
	 * pointers in the device header.
	 */
	l2arc_dev_hdr_update(dev);

	return (write_asize);
}
10161
10162 static boolean_t
l2arc_hdr_limit_reached(void)10163 l2arc_hdr_limit_reached(void)
10164 {
10165 int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
10166
10167 return (arc_reclaim_needed() ||
10168 (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
10169 }
10170
/*
 * Per-device L2ARC feed thread. Each L2ARC device has its own thread
 * to allow parallel writes to multiple devices.
 *
 * arg is the l2arc_dev_t to feed. The thread loops until l2ad_thread_exit
 * is set or the device's vdev is gone or dead; on the way out it clears
 * l2ad_feed_thread and broadcasts l2ad_feed_cv so that a thread waiting in
 * l2arc_remove_vdev() can proceed.
 *
 * dev->l2ad_feed_thr_lock is taken before the loop and is not explicitly
 * dropped again until CALLB_CPR_EXIT().
 */
static __attribute__((noreturn)) void
l2arc_feed_thread(void *arg)
{
	l2arc_dev_t *dev = arg;
	callb_cpr_t cpr;
	spa_t *spa;
	uint64_t size, wrote;
	/* The first wait uses the current time as its deadline. */
	clock_t begin, next = ddi_get_lbolt();
	fstrans_cookie_t cookie;

	ASSERT3P(dev, !=, NULL);

	CALLB_CPR_INIT(&cpr, &dev->l2ad_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dev->l2ad_feed_thr_lock);

	cookie = spl_fstrans_mark();
	while (dev->l2ad_thread_exit == B_FALSE) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_idle(&dev->l2ad_feed_cv,
		    &dev->l2ad_feed_thr_lock, next);
		CALLB_CPR_SAFE_END(&cpr, &dev->l2ad_feed_thr_lock);
		/*
		 * Default deadline for the next wait: hz ticks from now.
		 * It stays in effect when this iteration bails out through
		 * one of the "continue" statements below.
		 */
		next = ddi_get_lbolt() + hz;

		/*
		 * Check if thread should exit.
		 */
		if (dev->l2ad_thread_exit)
			break;

		/*
		 * Check if device is still valid. If not, thread should exit.
		 */
		if (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev))
			break;
		begin = ddi_get_lbolt();

		/*
		 * Try to acquire the spa config lock. If we can't get it,
		 * skip this iteration as removal might be in progress.
		 * The feed thread will exit naturally when it wakes up and
		 * sees l2ad_thread_exit is set.
		 */
		spa = dev->l2ad_spa;
		ASSERT3P(spa, !=, NULL);
		if (!spa_config_tryenter(spa, SCL_L2ARC, dev, RW_READER))
			continue;

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (l2arc_hdr_limit_reached()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		/* l2arc_write_size() also reports the feed interval. */
		clock_t interval;
		size = l2arc_write_size(dev, &interval);

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size);

		/*
		 * Adjust interval based on actual write.
		 * Nothing written: wait hz * l2arc_feed_secs ticks.
		 * Partial write: scale the interval by wrote / size.
		 */
		if (wrote == 0)
			interval = hz * l2arc_feed_secs;
		else if (wrote < size)
			interval = (interval * wrote) / size;

		/*
		 * Calculate next feed time.
		 * The interval is counted from the start of this feed
		 * (begin), but the deadline is never placed in the past.
		 */
		clock_t now = ddi_get_lbolt();
		next = MAX(now, MIN(now + interval, begin + interval));
		spa_config_exit(spa, SCL_L2ARC, dev);
	}
	spl_fstrans_unmark(cookie);

	/* Tell any waiter that this thread is gone. */
	dev->l2ad_feed_thread = NULL;
	cv_broadcast(&dev->l2ad_feed_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops dev->l2ad_feed_thr_lock */
	thread_exit();
}
10269
10270 boolean_t
l2arc_vdev_present(vdev_t * vd)10271 l2arc_vdev_present(vdev_t *vd)
10272 {
10273 return (l2arc_vdev_get(vd) != NULL);
10274 }
10275
10276 /*
10277 * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
10278 * the vdev_t isn't an L2ARC device.
10279 */
10280 l2arc_dev_t *
l2arc_vdev_get(vdev_t * vd)10281 l2arc_vdev_get(vdev_t *vd)
10282 {
10283 l2arc_dev_t *dev;
10284
10285 mutex_enter(&l2arc_dev_mtx);
10286 for (dev = list_head(l2arc_dev_list); dev != NULL;
10287 dev = list_next(l2arc_dev_list, dev)) {
10288 if (dev->l2ad_vdev == vd)
10289 break;
10290 }
10291 mutex_exit(&l2arc_dev_mtx);
10292
10293 return (dev);
10294 }
10295
10296 static void
l2arc_rebuild_dev(l2arc_dev_t * dev,boolean_t reopen)10297 l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
10298 {
10299 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
10300 uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
10301 spa_t *spa = dev->l2ad_spa;
10302
10303 /*
10304 * After a l2arc_remove_vdev(), the spa_t will no longer be valid
10305 */
10306 if (spa == NULL)
10307 return;
10308
10309 /*
10310 * The L2ARC has to hold at least the payload of one log block for
10311 * them to be restored (persistent L2ARC). The payload of a log block
10312 * depends on the amount of its log entries. We always write log blocks
10313 * with 1022 entries. How many of them are committed or restored depends
10314 * on the size of the L2ARC device. Thus the maximum payload of
10315 * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
10316 * is less than that, we reduce the amount of committed and restored
10317 * log entries per block so as to enable persistence.
10318 */
10319 if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
10320 dev->l2ad_log_entries = 0;
10321 } else {
10322 dev->l2ad_log_entries = MIN((dev->l2ad_end -
10323 dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
10324 L2ARC_LOG_BLK_MAX_ENTRIES);
10325 }
10326
10327 /*
10328 * Read the device header, if an error is returned do not rebuild L2ARC.
10329 */
10330 if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
10331 /*
10332 * If we are onlining a cache device (vdev_reopen) that was
10333 * still present (l2arc_vdev_present()) and rebuild is enabled,
10334 * we should evict all ARC buffers and pointers to log blocks
10335 * and reclaim their space before restoring its contents to
10336 * L2ARC.
10337 */
10338 if (reopen) {
10339 if (!l2arc_rebuild_enabled) {
10340 return;
10341 } else {
10342 l2arc_evict(dev, 0, B_TRUE);
10343 /* start a new log block */
10344 dev->l2ad_log_ent_idx = 0;
10345 dev->l2ad_log_blk_payload_asize = 0;
10346 dev->l2ad_log_blk_payload_start = 0;
10347 }
10348 }
10349 /*
10350 * Just mark the device as pending for a rebuild. We won't
10351 * be starting a rebuild in line here as it would block pool
10352 * import. Instead spa_load_impl will hand that off to an
10353 * async task which will call l2arc_spa_rebuild_start.
10354 */
10355 dev->l2ad_rebuild = B_TRUE;
10356 } else if (spa_writeable(spa)) {
10357 /*
10358 * In this case TRIM the whole device if l2arc_trim_ahead > 0,
10359 * otherwise create a new header. We zero out the memory holding
10360 * the header to reset dh_start_lbps. If we TRIM the whole
10361 * device the new header will be written by
10362 * vdev_trim_l2arc_thread() at the end of the TRIM to update the
10363 * trim_state in the header too. When reading the header, if
10364 * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
10365 * we opt to TRIM the whole device again.
10366 */
10367 if (l2arc_trim_ahead > 0) {
10368 dev->l2ad_trim_all = B_TRUE;
10369 } else {
10370 memset(l2dhdr, 0, l2dhdr_asize);
10371 l2arc_dev_hdr_update(dev);
10372 }
10373 }
10374 }
10375
10376
10377 /*
10378 * Recalculate smallest L2ARC device capacity for the given spa.
10379 * Must be called under l2arc_dev_mtx.
10380 */
10381 static void
l2arc_update_smallest_capacity(spa_t * spa)10382 l2arc_update_smallest_capacity(spa_t *spa)
10383 {
10384 ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
10385 l2arc_dev_t *dev;
10386 uint64_t smallest = UINT64_MAX;
10387
10388 for (dev = list_head(l2arc_dev_list); dev != NULL;
10389 dev = list_next(l2arc_dev_list, dev)) {
10390 if (dev->l2ad_spa == spa) {
10391 uint64_t cap = dev->l2ad_end - dev->l2ad_start;
10392 if (cap < smallest)
10393 smallest = cap;
10394 }
10395 }
10396
10397 spa->spa_l2arc_info.l2arc_smallest_capacity = smallest;
10398 }
10399
/*
 * Add a vdev for use by the L2ARC. By this point the spa has already
 * validated the vdev and opened it.
 *
 * Allocates and initializes the l2arc_dev_t, decides whether the device
 * should be rebuilt or trimmed, links it on the global device list, updates
 * the pool's capacity accounting and, for a writeable pool, starts the
 * device's feed thread. The vdev must not already be an L2ARC device.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;
	uint64_t l2dhdr_asize;

	ASSERT(!l2arc_vdev_present(vd));

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	/* leave extra size for an l2arc device header */
	l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
	    MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
	/* The usable range starts right after the device header. */
	adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
	/* Both the write hand and the evict hand begin at l2ad_start. */
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_evict = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;
	adddev->l2ad_trim_all = B_FALSE;
	adddev->l2ad_dwpd_writes = 0;
	adddev->l2ad_dwpd_start = gethrestime_sec();
	adddev->l2ad_dwpd_accumulated = 0;
	adddev->l2ad_dwpd_bump = l2arc_dwpd_bump;
	list_link_init(&adddev->l2ad_node);
	adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	/*
	 * This is a list of pointers to log blocks that are still present
	 * on the device.
	 */
	list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
	    offsetof(l2arc_lb_ptr_buf_t, node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	zfs_refcount_create(&adddev->l2ad_alloc);

	/*
	 * Initialize per-device thread fields
	 */
	adddev->l2ad_thread_exit = B_FALSE;
	mutex_init(&adddev->l2ad_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&adddev->l2ad_feed_cv, NULL, CV_DEFAULT, NULL);

	zfs_refcount_create(&adddev->l2ad_lb_asize);
	zfs_refcount_create(&adddev->l2ad_lb_count);

	/*
	 * Decide if dev is eligible for L2ARC rebuild or whole device
	 * trimming. This has to happen before the device is added in the
	 * cache device list and l2arc_dev_mtx is released. Otherwise
	 * l2arc_feed_thread() might already start writing on the
	 * device.
	 */
	l2arc_rebuild_dev(adddev, B_FALSE);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);

	/*
	 * Initialize pool-based position saving markers if this is the first
	 * L2ARC device for this pool
	 */
	if (!l2arc_pool_has_devices(spa)) {
		l2arc_pool_markers_init(spa);
	}

	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	spa->spa_l2arc_info.l2arc_total_capacity += (adddev->l2ad_end -
	    adddev->l2ad_start);
	l2arc_update_smallest_capacity(spa);

	/*
	 * Create per-device feed thread only if spa is writable.
	 * The thread name includes the spa name and device number
	 * for easy identification.
	 */
	if (spa_writeable(spa)) {
		char thread_name[MAXNAMELEN];
		snprintf(thread_name, sizeof (thread_name), "l2arc_%s_%llu",
		    spa_name(spa), (u_longlong_t)vd->vdev_id);
		adddev->l2ad_feed_thread = thread_create_named(thread_name,
		    NULL, 0, l2arc_feed_thread, adddev, 0, &p0, TS_RUN,
		    minclsyspri);
		/* A failure is only logged; the device stays registered. */
		if (adddev->l2ad_feed_thread == NULL) {
			cmn_err(CE_WARN, "l2arc: failed to create feed thread "
			    "for vdev %llu in pool '%s'",
			    (u_longlong_t)vd->vdev_id, spa_name(spa));
		}
	} else {
		adddev->l2ad_feed_thread = NULL;
	}

	mutex_exit(&l2arc_dev_mtx);
}
10515
10516 /*
10517 * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
10518 * in case of onlining a cache device.
10519 */
10520 void
l2arc_rebuild_vdev(vdev_t * vd,boolean_t reopen)10521 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
10522 {
10523 l2arc_dev_t *dev = NULL;
10524
10525 dev = l2arc_vdev_get(vd);
10526 ASSERT3P(dev, !=, NULL);
10527
10528 /*
10529 * In contrast to l2arc_add_vdev() we do not have to worry about
10530 * l2arc_feed_thread() invalidating previous content when onlining a
10531 * cache device. The device parameters (l2ad*) are not cleared when
10532 * offlining the device and writing new buffers will not invalidate
10533 * all previous content. In worst case only buffers that have not had
10534 * their log block written to the device will be lost.
10535 * When onlining the cache device (ie offline->online without exporting
10536 * the pool in between) this happens:
10537 * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
10538 * | |
10539 * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE
10540 * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
10541 * is set to B_TRUE we might write additional buffers to the device.
10542 */
10543 l2arc_rebuild_dev(dev, reopen);
10544 }
10545
/*
 * Arguments handed from l2arc_remove_vdev() to l2arc_device_teardown(),
 * which frees this structure when it is done with it.
 */
typedef struct {
	l2arc_dev_t *rva_l2arc_dev;	/* device being torn down */
	uint64_t rva_spa_gid;		/* spa_load_guid() of the pool */
	uint64_t rva_vdev_gid;		/* vdev_guid of the cache vdev */
	boolean_t rva_async;		/* teardown runs from a taskq */

} remove_vdev_args_t;
10553
/*
 * Final teardown of an L2ARC device that l2arc_remove_vdev() has already
 * taken off the global device list. arg is a remove_vdev_args_t; both the
 * device it points to and the argument structure itself are freed here.
 *
 * Called directly by l2arc_remove_vdev() in the synchronous case, or
 * dispatched to arc_flush_taskq when rva_async is set.
 */
static void
l2arc_device_teardown(void *arg)
{
	remove_vdev_args_t *rva = arg;
	l2arc_dev_t *remdev = rva->rva_l2arc_dev;
	hrtime_t start_time = gethrtime();

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
	list_destroy(&remdev->l2ad_lbptr_list);
	mutex_destroy(&remdev->l2ad_mtx);
	mutex_destroy(&remdev->l2ad_feed_thr_lock);
	cv_destroy(&remdev->l2ad_feed_cv);
	zfs_refcount_destroy(&remdev->l2ad_alloc);
	zfs_refcount_destroy(&remdev->l2ad_lb_asize);
	zfs_refcount_destroy(&remdev->l2ad_lb_count);
	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
	/* remdev must not be touched past this point. */
	vmem_free(remdev, sizeof (l2arc_dev_t));

	/* Only log removals that took at least one millisecond. */
	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
	if (elapsed > 0) {
		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",
		    (u_longlong_t)rva->rva_spa_gid,
		    (u_longlong_t)rva->rva_vdev_gid,
		    (u_longlong_t)elapsed);
	}

	/*
	 * NOTE(review): presumably drops the async flush entry that
	 * l2arc_remove_vdev() registered through arc_async_flush_add() with
	 * the same guid and the same second argument - confirm.
	 */
	if (rva->rva_async)
		arc_async_flush_remove(rva->rva_spa_gid, 2);
	kmem_free(rva, sizeof (remove_vdev_args_t));
}
10589
/*
 * Remove a vdev from the L2ARC.
 *
 * Cancels (and waits out) any rebuild of the device, stops its feed thread,
 * unlinks it from the global device list and adjusts the pool's capacity
 * accounting. The final teardown runs inline, unless the pool is in the
 * exported or destroyed state, in which case it is dispatched to
 * arc_flush_taskq. The vdev must be a registered L2ARC device.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED ||
	    spa->spa_state == POOL_STATE_DESTROYED;

	/*
	 * Find the device by vdev
	 */
	l2arc_dev_t *remdev = l2arc_vdev_get(vd);
	ASSERT3P(remdev, !=, NULL);

	/*
	 * Save info for final teardown
	 */
	remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t),
	    KM_SLEEP);
	rva->rva_l2arc_dev = remdev;
	rva->rva_spa_gid = spa_load_guid(spa);
	rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;

	/*
	 * Cancel any ongoing or scheduled rebuild.
	 * A rebuild that has not begun is only flagged as cancelled; one
	 * that has begun is waited for until l2ad_rebuild clears.
	 */
	mutex_enter(&l2arc_rebuild_thr_lock);
	remdev->l2ad_rebuild_cancel = B_TRUE;
	if (remdev->l2ad_rebuild_began == B_TRUE) {
		while (remdev->l2ad_rebuild == B_TRUE)
			cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
	}
	mutex_exit(&l2arc_rebuild_thr_lock);

	/*
	 * Signal per-device feed thread to exit and wait for it.
	 * Thread only exists if pool was imported read-write.
	 * The feed thread clears l2ad_feed_thread and broadcasts
	 * l2ad_feed_cv right before it exits.
	 */
	if (remdev->l2ad_feed_thread != NULL) {
		mutex_enter(&remdev->l2ad_feed_thr_lock);
		remdev->l2ad_thread_exit = B_TRUE;
		cv_signal(&remdev->l2ad_feed_cv);
		while (remdev->l2ad_feed_thread != NULL)
			cv_wait(&remdev->l2ad_feed_cv,
			    &remdev->l2ad_feed_thr_lock);
		mutex_exit(&remdev->l2ad_feed_thr_lock);
	}

	rva->rva_async = asynchronous;

	/*
	 * Remove device from global list
	 */
	ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC);
	mutex_enter(&l2arc_dev_mtx);
	list_remove(l2arc_dev_list, remdev);
	atomic_dec_64(&l2arc_ndev);
	spa->spa_l2arc_info.l2arc_total_capacity -=
	    (remdev->l2ad_end - remdev->l2ad_start);
	l2arc_update_smallest_capacity(spa);

	/*
	 * Clean up pool-based markers if this was the last L2ARC device
	 * for this pool
	 */
	if (!l2arc_pool_has_devices(spa)) {
		l2arc_pool_markers_fini(spa);
	}

	/* During a pool export spa & vdev will no longer be valid */
	if (asynchronous) {
		remdev->l2ad_spa = NULL;
		remdev->l2ad_vdev = NULL;
	}
	mutex_exit(&l2arc_dev_mtx);

	/* Synchronous case: tear the device down right here. */
	if (!asynchronous) {
		l2arc_device_teardown(rva);
		return;
	}

	/*
	 * Asynchronous case: register the flush under the pool's load guid
	 * and let arc_flush_taskq run the teardown, which then owns rva.
	 */
	arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2);

	taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva,
	    TQ_SLEEP, &af->af_tqent);
}
10678
10679 void
l2arc_init(void)10680 l2arc_init(void)
10681 {
10682 l2arc_ndev = 0;
10683
10684 mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
10685 cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
10686 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
10687 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
10688
10689 l2arc_dev_list = &L2ARC_dev_list;
10690 l2arc_free_on_write = &L2ARC_free_on_write;
10691 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
10692 offsetof(l2arc_dev_t, l2ad_node));
10693 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
10694 offsetof(l2arc_data_free_t, l2df_list_node));
10695 }
10696
10697 void
l2arc_fini(void)10698 l2arc_fini(void)
10699 {
10700 mutex_destroy(&l2arc_rebuild_thr_lock);
10701 cv_destroy(&l2arc_rebuild_thr_cv);
10702 mutex_destroy(&l2arc_dev_mtx);
10703 mutex_destroy(&l2arc_free_on_write_mtx);
10704
10705 list_destroy(l2arc_dev_list);
10706 list_destroy(l2arc_free_on_write);
10707 }
10708
10709
10710 /*
10711 * Punches out rebuild threads for the L2ARC devices in a spa. This should
10712 * be called after pool import from the spa async thread, since starting
10713 * these threads directly from spa_import() will make them part of the
10714 * "zpool import" context and delay process exit (and thus pool import).
10715 */
10716 void
l2arc_spa_rebuild_start(spa_t * spa)10717 l2arc_spa_rebuild_start(spa_t *spa)
10718 {
10719 ASSERT(spa_namespace_held());
10720
10721 /*
10722 * Locate the spa's l2arc devices and kick off rebuild threads.
10723 */
10724 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
10725 l2arc_dev_t *dev =
10726 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
10727 if (dev == NULL) {
10728 /* Don't attempt a rebuild if the vdev is UNAVAIL */
10729 continue;
10730 }
10731 mutex_enter(&l2arc_rebuild_thr_lock);
10732 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
10733 dev->l2ad_rebuild_began = B_TRUE;
10734 (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
10735 dev, 0, &p0, TS_RUN, minclsyspri);
10736 }
10737 mutex_exit(&l2arc_rebuild_thr_lock);
10738 }
10739 }
10740
10741 void
l2arc_spa_rebuild_stop(spa_t * spa)10742 l2arc_spa_rebuild_stop(spa_t *spa)
10743 {
10744 ASSERT(spa_namespace_held() ||
10745 spa->spa_export_thread == curthread);
10746
10747 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
10748 l2arc_dev_t *dev =
10749 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
10750 if (dev == NULL)
10751 continue;
10752 mutex_enter(&l2arc_rebuild_thr_lock);
10753 dev->l2ad_rebuild_cancel = B_TRUE;
10754 mutex_exit(&l2arc_rebuild_thr_lock);
10755 }
10756 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
10757 l2arc_dev_t *dev =
10758 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
10759 if (dev == NULL)
10760 continue;
10761 mutex_enter(&l2arc_rebuild_thr_lock);
10762 if (dev->l2ad_rebuild_began == B_TRUE) {
10763 while (dev->l2ad_rebuild == B_TRUE) {
10764 cv_wait(&l2arc_rebuild_thr_cv,
10765 &l2arc_rebuild_thr_lock);
10766 }
10767 }
10768 mutex_exit(&l2arc_rebuild_thr_lock);
10769 }
10770 }
10771
10772 /*
10773 * Main entry point for L2ARC rebuilding.
10774 */
10775 static __attribute__((noreturn)) void
l2arc_dev_rebuild_thread(void * arg)10776 l2arc_dev_rebuild_thread(void *arg)
10777 {
10778 l2arc_dev_t *dev = arg;
10779
10780 VERIFY(dev->l2ad_rebuild);
10781 (void) l2arc_rebuild(dev);
10782 mutex_enter(&l2arc_rebuild_thr_lock);
10783 dev->l2ad_rebuild_began = B_FALSE;
10784 dev->l2ad_rebuild = B_FALSE;
10785 cv_signal(&l2arc_rebuild_thr_cv);
10786 mutex_exit(&l2arc_rebuild_thr_lock);
10787
10788 thread_exit();
10789 }
10790
10791 /*
10792 * This function implements the actual L2ARC metadata rebuild. It:
10793 * starts reading the log block chain and restores each block's contents
10794 * to memory (reconstructing arc_buf_hdr_t's).
10795 *
10796 * Operation stops under any of the following conditions:
10797 *
10798 * 1) We reach the end of the log block chain.
10799 * 2) We encounter *any* error condition (cksum errors, io errors)
10800 */
10801 static int
l2arc_rebuild(l2arc_dev_t * dev)10802 l2arc_rebuild(l2arc_dev_t *dev)
10803 {
10804 vdev_t *vd = dev->l2ad_vdev;
10805 spa_t *spa = vd->vdev_spa;
10806 int err = 0;
10807 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
10808 l2arc_log_blk_phys_t *this_lb, *next_lb;
10809 zio_t *this_io = NULL, *next_io = NULL;
10810 l2arc_log_blkptr_t lbps[2];
10811 l2arc_lb_ptr_buf_t *lb_ptr_buf;
10812 boolean_t lock_held;
10813
10814 this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
10815 next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
10816
10817 /*
10818 * We prevent device removal while issuing reads to the device,
10819 * then during the rebuilding phases we drop this lock again so
10820 * that a spa_unload or device remove can be initiated - this is
10821 * safe, because the spa will signal us to stop before removing
10822 * our device and wait for us to stop.
10823 */
10824 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
10825 lock_held = B_TRUE;
10826
10827 /*
10828 * Retrieve the persistent L2ARC device state.
10829 * L2BLK_GET_PSIZE returns aligned size for log blocks.
10830 */
10831 dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
10832 dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
10833 L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
10834 dev->l2ad_start);
10835 dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
10836
10837 vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
10838 vd->vdev_trim_state = l2dhdr->dh_trim_state;
10839
10840 /*
10841 * In case the zfs module parameter l2arc_rebuild_enabled is false
10842 * we do not start the rebuild process.
10843 */
10844 if (!l2arc_rebuild_enabled)
10845 goto out;
10846
10847 /* Prepare the rebuild process */
10848 memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
10849
10850 /* Start the rebuild process */
10851 for (;;) {
10852 if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
10853 break;
10854
10855 if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
10856 this_lb, next_lb, this_io, &next_io)) != 0)
10857 goto out;
10858
10859 /*
10860 * Our memory pressure valve. If the system is running low
10861 * on memory, rather than swamping memory with new ARC buf
10862 * hdrs, we opt not to rebuild the L2ARC. At this point,
10863 * however, we have already set up our L2ARC dev to chain in
10864 * new metadata log blocks, so the user may choose to offline/
10865 * online the L2ARC dev at a later time (or re-import the pool)
10866 * to reconstruct it (when there's less memory pressure).
10867 */
10868 if (l2arc_hdr_limit_reached()) {
10869 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
10870 cmn_err(CE_NOTE, "System running low on memory, "
10871 "aborting L2ARC rebuild.");
10872 err = SET_ERROR(ENOMEM);
10873 goto out;
10874 }
10875
10876 spa_config_exit(spa, SCL_L2ARC, vd);
10877 lock_held = B_FALSE;
10878
10879 /*
10880 * Now that we know that the next_lb checks out alright, we
10881 * can start reconstruction from this log block.
10882 * L2BLK_GET_PSIZE returns aligned size for log blocks.
10883 */
10884 uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
10885 l2arc_log_blk_restore(dev, this_lb, asize);
10886
10887 /*
10888 * log block restored, include its pointer in the list of
10889 * pointers to log blocks present in the L2ARC device.
10890 */
10891 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
10892 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
10893 KM_SLEEP);
10894 memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
10895 sizeof (l2arc_log_blkptr_t));
10896 mutex_enter(&dev->l2ad_mtx);
10897 list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
10898 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
10899 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
10900 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
10901 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
10902 mutex_exit(&dev->l2ad_mtx);
10903 vdev_space_update(vd, asize, 0, 0);
10904
10905 /*
10906 * Protection against loops of log blocks:
10907 *
10908 * l2ad_hand l2ad_evict
10909 * V V
10910 * l2ad_start |=======================================| l2ad_end
10911 * -----|||----|||---|||----|||
10912 * (3) (2) (1) (0)
10913 * ---|||---|||----|||---|||
10914 * (7) (6) (5) (4)
10915 *
10916 * In this situation the pointer of log block (4) passes
10917 * l2arc_log_blkptr_valid() but the log block should not be
10918 * restored as it is overwritten by the payload of log block
10919 * (0). Only log blocks (0)-(3) should be restored. We check
10920 * whether l2ad_evict lies in between the payload starting
10921 * offset of the next log block (lbps[1].lbp_payload_start)
10922 * and the payload starting offset of the present log block
10923 * (lbps[0].lbp_payload_start). If true and this isn't the
10924 * first pass, we are looping from the beginning and we should
10925 * stop.
10926 */
10927 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
10928 lbps[0].lbp_payload_start, dev->l2ad_evict) &&
10929 !dev->l2ad_first)
10930 goto out;
10931
10932 kpreempt(KPREEMPT_SYNC);
10933 for (;;) {
10934 mutex_enter(&l2arc_rebuild_thr_lock);
10935 if (dev->l2ad_rebuild_cancel) {
10936 mutex_exit(&l2arc_rebuild_thr_lock);
10937 err = SET_ERROR(ECANCELED);
10938 goto out;
10939 }
10940 mutex_exit(&l2arc_rebuild_thr_lock);
10941 if (spa_config_tryenter(spa, SCL_L2ARC, vd,
10942 RW_READER)) {
10943 lock_held = B_TRUE;
10944 break;
10945 }
10946 /*
10947 * L2ARC config lock held by somebody in writer,
10948 * possibly due to them trying to remove us. They'll
10949 * likely to want us to shut down, so after a little
10950 * delay, we check l2ad_rebuild_cancel and retry
10951 * the lock again.
10952 */
10953 delay(1);
10954 }
10955
10956 /*
10957 * Continue with the next log block.
10958 */
10959 lbps[0] = lbps[1];
10960 lbps[1] = this_lb->lb_prev_lbp;
10961 PTR_SWAP(this_lb, next_lb);
10962 this_io = next_io;
10963 next_io = NULL;
10964 }
10965
10966 if (this_io != NULL)
10967 l2arc_log_blk_fetch_abort(this_io);
10968 out:
10969 if (next_io != NULL)
10970 l2arc_log_blk_fetch_abort(next_io);
10971 vmem_free(this_lb, sizeof (*this_lb));
10972 vmem_free(next_lb, sizeof (*next_lb));
10973
10974 if (err == ECANCELED) {
10975 /*
10976 * In case the rebuild was canceled do not log to spa history
10977 * log as the pool may be in the process of being removed.
10978 */
10979 zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
10980 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10981 return (err);
10982 } else if (!l2arc_rebuild_enabled) {
10983 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10984 "disabled");
10985 } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
10986 ARCSTAT_BUMP(arcstat_l2_rebuild_success);
10987 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10988 "successful, restored %llu blocks",
10989 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10990 } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
10991 /*
10992 * No error but also nothing restored, meaning the lbps array
10993 * in the device header points to invalid/non-present log
10994 * blocks. Reset the header.
10995 */
10996 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10997 "no valid log blocks");
10998 memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
10999 l2arc_dev_hdr_update(dev);
11000 } else if (err != 0) {
11001 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
11002 "aborted, restored %llu blocks",
11003 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
11004 }
11005
11006 if (lock_held)
11007 spa_config_exit(spa, SCL_L2ARC, vd);
11008
11009 return (err);
11010 }
11011
11012 /*
11013 * Attempts to read the device header on the provided L2ARC device and writes
11014 * it to `hdr'. On success, this function returns 0, otherwise the appropriate
11015 * error code is returned.
11016 */
11017 static int
l2arc_dev_hdr_read(l2arc_dev_t * dev)11018 l2arc_dev_hdr_read(l2arc_dev_t *dev)
11019 {
11020 int err;
11021 uint64_t guid;
11022 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11023 const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
11024 abd_t *abd;
11025
11026 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
11027
11028 abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
11029
11030 err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
11031 VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
11032 ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
11033 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
11034 ZIO_FLAG_SPECULATIVE, B_FALSE));
11035
11036 abd_free(abd);
11037
11038 if (err != 0) {
11039 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
11040 zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
11041 "vdev guid: %llu", err,
11042 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11043 return (err);
11044 }
11045
11046 if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
11047 byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
11048
11049 if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
11050 l2dhdr->dh_spa_guid != guid ||
11051 l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
11052 l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
11053 l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
11054 l2dhdr->dh_end != dev->l2ad_end ||
11055 !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
11056 l2dhdr->dh_evict) ||
11057 (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
11058 l2arc_trim_ahead > 0)) {
11059 /*
11060 * Attempt to rebuild a device containing no actual dev hdr
11061 * or containing a header from some other pool or from another
11062 * version of persistent L2ARC.
11063 */
11064 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
11065 return (SET_ERROR(ENOTSUP));
11066 }
11067
11068 return (0);
11069 }
11070
11071 /*
11072 * Reads L2ARC log blocks from storage and validates their contents.
11073 *
11074 * This function implements a simple fetcher to make sure that while
11075 * we're processing one buffer the L2ARC is already fetching the next
11076 * one in the chain.
11077 *
11078 * The arguments this_lp and next_lp point to the current and next log block
11079 * address in the block chain. Similarly, this_lb and next_lb hold the
11080 * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
11081 *
11082 * The `this_io' and `next_io' arguments are used for block fetching.
11083 * When issuing the first blk IO during rebuild, you should pass NULL for
11084 * `this_io'. This function will then issue a sync IO to read the block and
11085 * also issue an async IO to fetch the next block in the block chain. The
11086 * fetched IO is returned in `next_io'. On subsequent calls to this
11087 * function, pass the value returned in `next_io' from the previous call
11088 * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
11089 * Prior to the call, you should initialize your `next_io' pointer to be
11090 * NULL. If no fetch IO was issued, the pointer is left set at NULL.
11091 *
11092 * On success, this function returns 0, otherwise it returns an appropriate
11093 * error code. On error the fetching IO is aborted and cleared before
11094 * returning from this function. Therefore, if we return `success', the
11095 * caller can assume that we have taken care of cleanup of fetch IOs.
11096 */
11097 static int
l2arc_log_blk_read(l2arc_dev_t * dev,const l2arc_log_blkptr_t * this_lbp,const l2arc_log_blkptr_t * next_lbp,l2arc_log_blk_phys_t * this_lb,l2arc_log_blk_phys_t * next_lb,zio_t * this_io,zio_t ** next_io)11098 l2arc_log_blk_read(l2arc_dev_t *dev,
11099 const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
11100 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
11101 zio_t *this_io, zio_t **next_io)
11102 {
11103 int err = 0;
11104 zio_cksum_t cksum;
11105 uint64_t asize;
11106
11107 ASSERT(this_lbp != NULL && next_lbp != NULL);
11108 ASSERT(this_lb != NULL && next_lb != NULL);
11109 ASSERT(next_io != NULL && *next_io == NULL);
11110 ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
11111
11112 /*
11113 * Check to see if we have issued the IO for this log block in a
11114 * previous run. If not, this is the first call, so issue it now.
11115 */
11116 if (this_io == NULL) {
11117 this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
11118 this_lb);
11119 }
11120
11121 /*
11122 * Peek to see if we can start issuing the next IO immediately.
11123 */
11124 if (l2arc_log_blkptr_valid(dev, next_lbp)) {
11125 /*
11126 * Start issuing IO for the next log block early - this
11127 * should help keep the L2ARC device busy while we
11128 * decompress and restore this log block.
11129 */
11130 *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
11131 next_lb);
11132 }
11133
11134 /* Wait for the IO to read this log block to complete */
11135 if ((err = zio_wait(this_io)) != 0) {
11136 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
11137 zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
11138 "offset: %llu, vdev guid: %llu", err,
11139 (u_longlong_t)this_lbp->lbp_daddr,
11140 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11141 goto cleanup;
11142 }
11143
11144 /*
11145 * Make sure the buffer checks out.
11146 * L2BLK_GET_PSIZE returns aligned size for log blocks.
11147 */
11148 asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
11149 fletcher_4_native(this_lb, asize, NULL, &cksum);
11150 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
11151 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
11152 zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
11153 "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
11154 (u_longlong_t)this_lbp->lbp_daddr,
11155 (u_longlong_t)dev->l2ad_vdev->vdev_guid,
11156 (u_longlong_t)dev->l2ad_hand,
11157 (u_longlong_t)dev->l2ad_evict);
11158 err = SET_ERROR(ECKSUM);
11159 goto cleanup;
11160 }
11161
11162 /* Now we can take our time decoding this buffer */
11163 switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
11164 case ZIO_COMPRESS_OFF:
11165 break;
11166 case ZIO_COMPRESS_LZ4: {
11167 abd_t *abd = abd_alloc_linear(asize, B_TRUE);
11168 abd_copy_from_buf_off(abd, this_lb, 0, asize);
11169 abd_t dabd;
11170 abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
11171 err = zio_decompress_data(
11172 L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
11173 abd, &dabd, asize, sizeof (*this_lb), NULL);
11174 abd_free(&dabd);
11175 abd_free(abd);
11176 if (err != 0) {
11177 err = SET_ERROR(EINVAL);
11178 goto cleanup;
11179 }
11180 break;
11181 }
11182 default:
11183 err = SET_ERROR(EINVAL);
11184 goto cleanup;
11185 }
11186 if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
11187 byteswap_uint64_array(this_lb, sizeof (*this_lb));
11188 if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
11189 err = SET_ERROR(EINVAL);
11190 goto cleanup;
11191 }
11192 cleanup:
11193 /* Abort an in-flight fetch I/O in case of error */
11194 if (err != 0 && *next_io != NULL) {
11195 l2arc_log_blk_fetch_abort(*next_io);
11196 *next_io = NULL;
11197 }
11198 return (err);
11199 }
11200
11201 /*
11202 * Restores the payload of a log block to ARC. This creates empty ARC hdr
11203 * entries which only contain an l2arc hdr, essentially restoring the
11204 * buffers to their L2ARC evicted state. This function also updates space
11205 * usage on the L2ARC vdev to make sure it tracks restored buffers.
11206 */
11207 static void
l2arc_log_blk_restore(l2arc_dev_t * dev,const l2arc_log_blk_phys_t * lb,uint64_t lb_asize)11208 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
11209 uint64_t lb_asize)
11210 {
11211 uint64_t size = 0, asize = 0;
11212 uint64_t log_entries = dev->l2ad_log_entries;
11213
11214 /*
11215 * Usually arc_adapt() is called only for data, not headers, but
11216 * since we may allocate significant amount of memory here, let ARC
11217 * grow its arc_c.
11218 */
11219 arc_adapt(log_entries * HDR_L2ONLY_SIZE);
11220
11221 for (int i = log_entries - 1; i >= 0; i--) {
11222 /*
11223 * Restore goes in the reverse temporal direction to preserve
11224 * correct temporal ordering of buffers in the l2ad_buflist.
11225 * l2arc_hdr_restore also does a list_insert_tail instead of
11226 * list_insert_head on the l2ad_buflist:
11227 *
11228 * LIST l2ad_buflist LIST
11229 * HEAD <------ (time) ------ TAIL
11230 * direction +-----+-----+-----+-----+-----+ direction
11231 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
11232 * fill +-----+-----+-----+-----+-----+
11233 * ^ ^
11234 * | |
11235 * | |
11236 * l2arc_feed_thread l2arc_rebuild
11237 * will place new bufs here restores bufs here
11238 *
11239 * During l2arc_rebuild() the device is not used by
11240 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
11241 */
11242 size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
11243 asize += vdev_psize_to_asize(dev->l2ad_vdev,
11244 L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
11245 l2arc_hdr_restore(&lb->lb_entries[i], dev);
11246 }
11247
11248 /*
11249 * Record rebuild stats:
11250 * size Logical size of restored buffers in the L2ARC
11251 * asize Aligned size of restored buffers in the L2ARC
11252 */
11253 ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
11254 ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
11255 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
11256 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
11257 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
11258 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
11259 }
11260
11261 /*
11262 * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
11263 * into a state indicating that it has been evicted to L2ARC.
11264 */
11265 static void
l2arc_hdr_restore(const l2arc_log_ent_phys_t * le,l2arc_dev_t * dev)11266 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
11267 {
11268 arc_buf_hdr_t *hdr, *exists;
11269 kmutex_t *hash_lock;
11270 arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
11271 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
11272 L2BLK_GET_PSIZE((le)->le_prop));
11273
11274 /*
11275 * Do all the allocation before grabbing any locks, this lets us
11276 * sleep if memory is full and we don't have to deal with failed
11277 * allocations.
11278 */
11279 hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
11280 dev, le->le_dva, le->le_daddr,
11281 L2BLK_GET_PSIZE((le)->le_prop), asize, le->le_birth,
11282 L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
11283 L2BLK_GET_PROTECTED((le)->le_prop),
11284 L2BLK_GET_PREFETCH((le)->le_prop),
11285 L2BLK_GET_STATE((le)->le_prop));
11286
11287 /*
11288 * vdev_space_update() has to be called before arc_hdr_destroy() to
11289 * avoid underflow since the latter also calls vdev_space_update().
11290 */
11291 l2arc_hdr_arcstats_increment(hdr);
11292 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11293
11294 mutex_enter(&dev->l2ad_mtx);
11295 list_insert_tail(&dev->l2ad_buflist, hdr);
11296 (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
11297 mutex_exit(&dev->l2ad_mtx);
11298
11299 exists = buf_hash_insert(hdr, &hash_lock);
11300 if (exists) {
11301 /* Buffer was already cached, no need to restore it. */
11302 arc_hdr_destroy(hdr);
11303 /*
11304 * If the buffer is already cached, check whether it has
11305 * L2ARC metadata. If not, enter them and update the flag.
11306 * This is important is case of onlining a cache device, since
11307 * we previously evicted all L2ARC metadata from ARC.
11308 */
11309 if (!HDR_HAS_L2HDR(exists)) {
11310 arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
11311 exists->b_l2hdr.b_dev = dev;
11312 exists->b_l2hdr.b_daddr = le->le_daddr;
11313 exists->b_l2hdr.b_arcs_state =
11314 L2BLK_GET_STATE((le)->le_prop);
11315 /* l2arc_hdr_arcstats_update() expects a valid asize */
11316 HDR_SET_L2SIZE(exists, asize);
11317 mutex_enter(&dev->l2ad_mtx);
11318 list_insert_tail(&dev->l2ad_buflist, exists);
11319 (void) zfs_refcount_add_many(&dev->l2ad_alloc,
11320 arc_hdr_size(exists), exists);
11321 mutex_exit(&dev->l2ad_mtx);
11322 l2arc_hdr_arcstats_increment(exists);
11323 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11324 }
11325 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
11326 }
11327
11328 mutex_exit(hash_lock);
11329 }
11330
11331 /*
11332 * Starts an asynchronous read IO to read a log block. This is used in log
11333 * block reconstruction to start reading the next block before we are done
11334 * decoding and reconstructing the current block, to keep the l2arc device
11335 * nice and hot with read IO to process.
11336 * The returned zio will contain a newly allocated memory buffers for the IO
11337 * data which should then be freed by the caller once the zio is no longer
11338 * needed (i.e. due to it having completed). If you wish to abort this
11339 * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
11340 * care of disposing of the allocated buffers correctly.
11341 */
11342 static zio_t *
l2arc_log_blk_fetch(vdev_t * vd,const l2arc_log_blkptr_t * lbp,l2arc_log_blk_phys_t * lb)11343 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
11344 l2arc_log_blk_phys_t *lb)
11345 {
11346 uint32_t asize;
11347 zio_t *pio;
11348 l2arc_read_callback_t *cb;
11349
11350 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
11351 asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
11352 ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
11353
11354 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
11355 cb->l2rcb_abd = abd_get_from_buf(lb, asize);
11356 pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
11357 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
11358 (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
11359 cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
11360 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
11361 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
11362
11363 return (pio);
11364 }
11365
11366 /*
11367 * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
11368 * buffers allocated for it.
11369 */
11370 static void
l2arc_log_blk_fetch_abort(zio_t * zio)11371 l2arc_log_blk_fetch_abort(zio_t *zio)
11372 {
11373 (void) zio_wait(zio);
11374 }
11375
11376 /*
11377 * Creates a zio to update the device header on an l2arc device.
11378 */
11379 void
l2arc_dev_hdr_update(l2arc_dev_t * dev)11380 l2arc_dev_hdr_update(l2arc_dev_t *dev)
11381 {
11382 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11383 const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
11384 abd_t *abd;
11385 int err;
11386
11387 VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
11388
11389 l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
11390 l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
11391 l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
11392 l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
11393 l2dhdr->dh_log_entries = dev->l2ad_log_entries;
11394 l2dhdr->dh_evict = dev->l2ad_evict;
11395 l2dhdr->dh_start = dev->l2ad_start;
11396 l2dhdr->dh_end = dev->l2ad_end;
11397 l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
11398 l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
11399 l2dhdr->dh_flags = 0;
11400 l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
11401 l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
11402 if (dev->l2ad_first)
11403 l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
11404
11405 abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
11406
11407 err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
11408 VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
11409 NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
11410
11411 abd_free(abd);
11412
11413 if (err != 0) {
11414 zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
11415 "vdev guid: %llu", err,
11416 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11417 }
11418 }
11419
11420 /*
11421 * Commits a log block to the L2ARC device. This routine is invoked from
11422 * l2arc_write_buffers when the log block fills up.
11423 * This function allocates some memory to temporarily hold the serialized
11424 * buffer to be written. This is then released in l2arc_write_done.
11425 */
11426 static uint64_t
l2arc_log_blk_commit(l2arc_dev_t * dev,zio_t * pio,l2arc_write_callback_t * cb)11427 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
11428 {
11429 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
11430 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11431 uint64_t psize, asize;
11432 zio_t *wzio;
11433 l2arc_lb_abd_buf_t *abd_buf;
11434 abd_t *abd = NULL;
11435 l2arc_lb_ptr_buf_t *lb_ptr_buf;
11436
11437 VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
11438
11439 abd_buf = zio_buf_alloc(sizeof (*abd_buf));
11440 abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
11441 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
11442 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
11443
11444 /* link the buffer into the block chain */
11445 lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
11446 lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
11447
11448 /*
11449 * l2arc_log_blk_commit() may be called multiple times during a single
11450 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
11451 * so we can free them in l2arc_write_done() later on.
11452 */
11453 list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
11454
11455 /* try to compress the buffer, at least one sector to save */
11456 psize = zio_compress_data(ZIO_COMPRESS_LZ4,
11457 abd_buf->abd, &abd, sizeof (*lb),
11458 zio_get_compression_max_size(ZIO_COMPRESS_LZ4,
11459 dev->l2ad_vdev->vdev_ashift,
11460 dev->l2ad_vdev->vdev_ashift, sizeof (*lb)), 0);
11461
11462 /* a log block is never entirely zero */
11463 ASSERT(psize != 0);
11464 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
11465 ASSERT(asize <= sizeof (*lb));
11466
11467 /*
11468 * Update the start log block pointer in the device header to point
11469 * to the log block we're about to write.
11470 */
11471 l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
11472 l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
11473 l2dhdr->dh_start_lbps[0].lbp_payload_asize =
11474 dev->l2ad_log_blk_payload_asize;
11475 l2dhdr->dh_start_lbps[0].lbp_payload_start =
11476 dev->l2ad_log_blk_payload_start;
11477 L2BLK_SET_LSIZE(
11478 (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
11479 L2BLK_SET_PSIZE(
11480 (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
11481 L2BLK_SET_CHECKSUM(
11482 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11483 ZIO_CHECKSUM_FLETCHER_4);
11484 if (asize < sizeof (*lb)) {
11485 /* compression succeeded */
11486 abd_zero_off(abd, psize, asize - psize);
11487 L2BLK_SET_COMPRESS(
11488 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11489 ZIO_COMPRESS_LZ4);
11490 } else {
11491 /* compression failed */
11492 abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
11493 L2BLK_SET_COMPRESS(
11494 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11495 ZIO_COMPRESS_OFF);
11496 }
11497
11498 /* checksum what we're about to write */
11499 abd_fletcher_4_native(abd, asize, NULL,
11500 &l2dhdr->dh_start_lbps[0].lbp_cksum);
11501
11502 abd_free(abd_buf->abd);
11503
11504 /* perform the write itself */
11505 abd_buf->abd = abd;
11506 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
11507 asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
11508 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
11509 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
11510 (void) zio_nowait(wzio);
11511
11512 dev->l2ad_hand += asize;
11513 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11514
11515 /*
11516 * Include the committed log block's pointer in the list of pointers
11517 * to log blocks present in the L2ARC device.
11518 */
11519 memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
11520 sizeof (l2arc_log_blkptr_t));
11521 mutex_enter(&dev->l2ad_mtx);
11522 list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
11523 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
11524 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
11525 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
11526 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
11527 mutex_exit(&dev->l2ad_mtx);
11528
11529 /* bump the kstats */
11530 ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
11531 ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
11532 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
11533 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
11534 dev->l2ad_log_blk_payload_asize / asize);
11535
11536 /* start a new log block */
11537 dev->l2ad_log_ent_idx = 0;
11538 dev->l2ad_log_blk_payload_asize = 0;
11539 dev->l2ad_log_blk_payload_start = 0;
11540
11541 return (asize);
11542 }
11543
11544 /*
11545 * Validates an L2ARC log block address to make sure that it can be read
11546 * from the provided L2ARC device.
11547 */
11548 boolean_t
l2arc_log_blkptr_valid(l2arc_dev_t * dev,const l2arc_log_blkptr_t * lbp)11549 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
11550 {
11551 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
11552 uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
11553 uint64_t end = lbp->lbp_daddr + asize - 1;
11554 uint64_t start = lbp->lbp_payload_start;
11555 boolean_t evicted = B_FALSE;
11556
11557 /*
11558 * A log block is valid if all of the following conditions are true:
11559 * - it fits entirely (including its payload) between l2ad_start and
11560 * l2ad_end
11561 * - it has a valid size
11562 * - neither the log block itself nor part of its payload was evicted
11563 * by l2arc_evict():
11564 *
11565 * l2ad_hand l2ad_evict
11566 * | | lbp_daddr
11567 * | start | | end
11568 * | | | | |
11569 * V V V V V
11570 * l2ad_start ============================================ l2ad_end
11571 * --------------------------||||
11572 * ^ ^
11573 * | log block
11574 * payload
11575 */
11576
11577 evicted =
11578 l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
11579 l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
11580 l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
11581 l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
11582
11583 return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
11584 asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
11585 (!evicted || dev->l2ad_first));
11586 }
11587
11588 /*
11589 * Inserts ARC buffer header `hdr' into the current L2ARC log block on
11590 * the device. The buffer being inserted must be present in L2ARC.
11591 * Returns B_TRUE if the L2ARC log block is full and needs to be committed
11592 * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
11593 */
11594 static boolean_t
l2arc_log_blk_insert(l2arc_dev_t * dev,const arc_buf_hdr_t * hdr)11595 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
11596 {
11597 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
11598 l2arc_log_ent_phys_t *le;
11599
11600 if (dev->l2ad_log_entries == 0)
11601 return (B_FALSE);
11602
11603 int index = dev->l2ad_log_ent_idx++;
11604
11605 ASSERT3S(index, <, dev->l2ad_log_entries);
11606 ASSERT(HDR_HAS_L2HDR(hdr));
11607
11608 le = &lb->lb_entries[index];
11609 memset(le, 0, sizeof (*le));
11610 le->le_dva = hdr->b_dva;
11611 le->le_birth = hdr->b_birth;
11612 le->le_daddr = hdr->b_l2hdr.b_daddr;
11613 if (index == 0)
11614 dev->l2ad_log_blk_payload_start = le->le_daddr;
11615 L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
11616 L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
11617 L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
11618 le->le_complevel = hdr->b_complevel;
11619 L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
11620 L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
11621 L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
11622 L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
11623
11624 dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
11625 HDR_GET_PSIZE(hdr));
11626
11627 return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
11628 }
11629
11630 /*
11631 * Checks whether a given L2ARC device address sits in a time-sequential
11632 * range. The trick here is that the L2ARC is a rotary buffer, so we can't
11633 * just do a range comparison, we need to handle the situation in which the
11634 * range wraps around the end of the L2ARC device. Arguments:
11635 * bottom -- Lower end of the range to check (written to earlier).
11636 * top -- Upper end of the range to check (written to later).
11637 * check -- The address for which we want to determine if it sits in
11638 * between the top and bottom.
11639 *
11640 * The 3-way conditional below represents the following cases:
11641 *
11642 * bottom < top : Sequentially ordered case:
11643 * <check>--------+-------------------+
11644 * | (overlap here?) |
11645 * L2ARC dev V V
11646 * |---------------<bottom>============<top>--------------|
11647 *
11648 * bottom > top: Looped-around case:
11649 * <check>--------+------------------+
11650 * | (overlap here?) |
11651 * L2ARC dev V V
11652 * |===============<top>---------------<bottom>===========|
11653 * ^ ^
11654 * | (or here?) |
11655 * +---------------+---------<check>
11656 *
11657 * top == bottom : Just a single address comparison.
11658 */
boolean_t
l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
{
	/* Degenerate range: it consists of one address only. */
	if (bottom == top)
		return (check == top);

	/*
	 * Looped-around range: it runs from bottom up to the end of the
	 * device and continues from the start of the device up to top.
	 */
	if (top < bottom)
		return (check >= bottom || check <= top);

	/* Sequentially ordered range: [bottom, top], both ends inclusive. */
	return (check >= bottom && check <= top);
}
11669
/* ARC entry points made available to other modules via EXPORT_SYMBOL. */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_buf_destroy);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
11678
/*
 * ARC module parameters, registered in the zfs_arc scope with the zfs_arc_
 * name prefix. ZFS_MODULE_PARAM_CALL entries name a set/get function pair
 * (e.g. param_set_arc_min / spl_param_get_u64); plain ZFS_MODULE_PARAM
 * entries give a type tag (UINT, INT, U64) instead. The trailing string is
 * the parameter's description.
 */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
	spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
	spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
	"Balance between metadata and data on ghost hits.");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
	param_get_uint, ZMOD_RW, "Seconds before growing ARC size");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
	param_set_arc_no_grow_shift, param_get_uint, ZMOD_RW,
	"log2(fraction of ARC which must be free to allow growing)");

/* pc_percent is only declared when building with _KERNEL defined. */
#ifdef _KERNEL
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
	"Percent of pagecache to reclaim ARC to");
#endif

/* Declared with ZMOD_RD, unlike the ZMOD_RW entries above. */
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
	"Target average block size");
11705
11706 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
11707 "Disable compressed ARC buffers");
11708
/*
 * Minimum lifetimes, in milliseconds per their descriptions, for prefetched
 * and prescient-prefetched blocks. Both go through param_set_arc_int.
 */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
	param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
	param_set_arc_int, param_get_uint, ZMOD_RW,
	"Min life of prescient prefetched block in ms");
11715
/*
 * L2ARC module parameters, registered in the zfs_l2arc scope with the
 * l2arc_ name prefix. All are ZMOD_RW. dwpd_limit is the only entry in this
 * group that names its own set function (param_set_l2arc_dwpd_limit); the
 * others are plain typed parameters.
 */
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
	"Max write bytes per interval");

ZFS_MODULE_PARAM_CALL(zfs_l2arc, l2arc_, dwpd_limit, param_set_l2arc_dwpd_limit,
	spl_param_get_u64, ZMOD_RW,
	"L2ARC device endurance limit as percentage (100 = 1.0 DWPD)");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
	"Number of max device writes to precache");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
	"Compressed l2arc_headroom multiplier");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
	"TRIM ahead L2ARC write size multiplier");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
	"Seconds between L2ARC writing");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
	"Min feed interval in milliseconds");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
	"Skip caching prefetched buffers");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
	"Turbo L2ARC warmup");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
	"No reads during writes");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
	"Percent of ARC size allowed for L2ARC-only headers");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
	"Rebuild the L2ARC when importing a pool");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
	"Min size in bytes to write rebuild log blocks in L2ARC");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
	"Cache only MFU data from ARC into L2ARC");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
	"Exclude dbufs on special vdevs from being cached to L2ARC if set.");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_cycles, U64, ZMOD_RW,
	"Consecutive metadata cycles before skipping to let data run");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, ext_headroom_pct, U64, ZMOD_RW,
	"Depth cap as percentage of state size for marker reset");
11767
11768 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
11769 param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");
11770
/* 64-bit byte-valued parameter; set via param_set_arc_u64. */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
	spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");
11773
11774 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
11775 spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC");
11776
/*
 * Remaining zfs_arc parameters: dnode limits, eviction batching and the
 * prune/evict thread counts. Only dnode_limit_percent names a set function
 * (param_set_arc_int); the rest are plain typed parameters.
 */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
	param_set_arc_int, param_get_uint, ZMOD_RW,
	"Percent of ARC meta buffers for dnodes");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
	"Percentage of excess dnodes to try to unpin");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
	"When full, ARC allocation waits for eviction of this % of alloc size");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
	"The number of headers to evict per sublist before moving to the next");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW,
	"The number of batches to run per parallel eviction task");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
	"Number of arc_prune threads");

/* Declared with ZMOD_RD, unlike the ZMOD_RW entries above. */
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
	"Number of threads to use for ARC eviction.");
11798